├── .gitignore ├── CMakeLists.txt ├── GNUmakefile ├── LICENSE ├── README.md ├── c_src └── sync_nif.c ├── examples └── chronicled │ ├── .gitignore │ ├── README.md │ ├── rebar.config │ └── src │ ├── chronicled.app.src │ ├── chronicled_app.erl │ ├── chronicled_server.erl │ └── chronicled_sup.erl ├── rebar.config ├── rebar.lock ├── rebar2.config ├── rebar2.config.script ├── scripts ├── chronicle_dump │ └── chronicle_dump.erl └── test_leader_remove_addback.sh ├── src ├── chronicle.app.src ├── chronicle.erl ├── chronicle.hrl ├── chronicle_agent.erl ├── chronicle_agent_sup.erl ├── chronicle_app.erl ├── chronicle_catchup.erl ├── chronicle_config.erl ├── chronicle_config_rsm.erl ├── chronicle_env.erl ├── chronicle_ets.erl ├── chronicle_events.erl ├── chronicle_failover.erl ├── chronicle_kv.erl ├── chronicle_leader.erl ├── chronicle_log.erl ├── chronicle_logger_filter.erl ├── chronicle_peers.erl ├── chronicle_proposer.erl ├── chronicle_rsm.erl ├── chronicle_rsm_sup.erl ├── chronicle_secondary_restartable_sup.erl ├── chronicle_secondary_sup.erl ├── chronicle_server.erl ├── chronicle_settings.erl ├── chronicle_single_rsm_sup.erl ├── chronicle_snapshot_mgr.erl ├── chronicle_stats.erl ├── chronicle_status.erl ├── chronicle_storage.erl ├── chronicle_sup.erl ├── chronicle_utils.erl └── dynamic_supervisor.erl ├── start_cluster └── test ├── chronicle_log_tests.erl ├── chronicle_peers_vnet.erl ├── chronicle_tests.erl ├── dynamic_supervisor_tests.erl └── misc_tests.erl /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.beam 3 | /_build/ 4 | /TAGS 5 | /TAGS.root 6 | /cluster/ 7 | /priv/ 8 | /c_src/*.d 9 | /c_src/*.o 10 | /tmp/ 11 | 12 | # rebar2 13 | .rebar/ 14 | /chronicle_dump 15 | compile_commands.json 16 | ebin 17 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Used when built as part of 
couchbase server only. 2 | 3 | if(NOT DEFINED REBAR_VERSION) 4 | # Default to building with rebar2 (the old behaviour) 5 | SET(REBAR_VERSION "rebar2") 6 | endif() 7 | 8 | IF (${REBAR_VERSION} STREQUAL "rebar2") 9 | # Config override only needed for rebar2 10 | SET (REBAR_OPTS -C "${CMAKE_CURRENT_SOURCE_DIR}/rebar2.config") 11 | ENDIF() 12 | 13 | IF (NOT DEFINED INST_LIBDIR) 14 | MESSAGE(FATAL_ERROR "INST_LIBDIR is not defined.") 15 | ENDIF () 16 | 17 | IF (NOT TARGET ns_realclean) 18 | MESSAGE(FATAL_ERROR "ns_realclean target does not exist.") 19 | ENDIF () 20 | 21 | REBAR (TARGET chronicle REBAR_OPTS ${REBAR_OPTS} CLEAN_HOOK ns_realclean) 22 | REBAR (TARGET chronicle_dump REBAR_OPTS ${REBAR_OPTS} 23 | COMMAND escriptize NOCLEAN DEPENDS chronicle) 24 | 25 | SET(INSTALL_DIR "${INST_LIBDIR}/chronicle") 26 | 27 | IF (${REBAR_VERSION} STREQUAL "rebar2") 28 | INSTALL (DIRECTORY ebin DESTINATION "${INSTALL_DIR}") 29 | INSTALL (PROGRAMS chronicle_dump DESTINATION bin) 30 | ELSE() 31 | INSTALL (DIRECTORY _build/default/lib/chronicle/ebin DESTINATION "${INSTALL_DIR}") 32 | INSTALL (PROGRAMS _build/default/bin/chronicle_dump DESTINATION bin) 33 | ENDIF() 34 | 35 | # priv/ only exists on Linux currently, so make this OPTIONAL 36 | INSTALL (DIRECTORY priv OPTIONAL DESTINATION "${INSTALL_DIR}") 37 | -------------------------------------------------------------------------------- /GNUmakefile: -------------------------------------------------------------------------------- 1 | SOURCE_DIR = src 2 | SOURCES = $(wildcard ${SOURCE_DIR}/*.[he]rl) 3 | 4 | TAGS: $(SOURCES) 5 | erl -src_dir "${SOURCE_DIR}" -tag_file "$@" -noinput \ 6 | -eval "{ok, SrcDir} = init:get_argument(src_dir), \ 7 | {ok, TagFile} = init:get_argument(tag_file), \ 8 | tags:subdir(SrcDir, [{outfile, TagFile}]), \ 9 | init:stop(0)." 
10 | 11 | .PHONY: TAGS.root 12 | TAGS.root: 13 | erl -tag_file "$@" -noinput \ 14 | -eval "{ok, TagFile} = init:get_argument(tag_file), \ 15 | tags:root([{outfile, TagFile}]), \ 16 | init:stop(0)." 17 | 18 | .PHONY: watch 19 | watch: 20 | ls $(SOURCE_DIR)/*.[he]rl | entr -n $(MAKE) TAGS 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Chronicle is an implementation of Raft consensus protocol that Couchbase 2 | Server uses to manage its internal metadata. 3 | 4 | Notable features: 5 | 6 | * Supports voting and non-voting participants. 7 | * Implements pre-vote protocol to prevent individual nodes from starting 8 | elections unnecessarily. 9 | * Supports multiple pluggable state machines that run on a single replicated 10 | log. 11 | * Reads are served by the local replica by default, providing sequential 12 | consistency. Linearizable reads are also supported. 13 | * Supports exactly-once writes. 14 | * Extends Raft to support quorum-loss disaster recovery: through manual 15 | intervention, it's possible to make the cluster available for writes again 16 | even when the majority of nodes are unavailable. The surviving nodes may 17 | not have all updates (violating linearizability), but all updates that made 18 | it to them will be preserved. 
The surviving nodes will be insulated from 19 | the failed-over nodes so no new updates from them may propagate. 20 | -------------------------------------------------------------------------------- /c_src/sync_nif.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Couchbase 3 | * @copyright 2020 Couchbase, Inc. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | **/ 17 | 18 | #include <errno.h> 19 | #include <fcntl.h> 20 | #include <stdbool.h> 21 | #include <sys/stat.h> 22 | #include <sys/types.h> 23 | #include <unistd.h> 24 | 25 | #include "erl_driver.h" 26 | #include "erl_nif.h" 27 | 28 | static ERL_NIF_TERM am_ok; 29 | static ERL_NIF_TERM am_error; 30 | static ERL_NIF_TERM am_close_failed; 31 | 32 | 33 | static ERL_NIF_TERM 34 | make_errno_atom(ErlNifEnv *env, int errnum) 35 | { 36 | return enif_make_atom(env, erl_errno_id(errnum)); 37 | } 38 | 39 | static ERL_NIF_TERM 40 | make_errno_error(ErlNifEnv *env, int errnum) 41 | { 42 | return enif_make_tuple2(env, 43 | am_error, 44 | make_errno_atom(env, errnum)); 45 | } 46 | 47 | static ERL_NIF_TERM 48 | do_sync_dir(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) 49 | { 50 | ErlNifBinary path; 51 | 52 | if (!enif_inspect_binary(env, argv[0], &path)) { 53 | return enif_make_badarg(env); 54 | } 55 | 56 | int fd; 57 | 58 | while (true) { 59 | fd = open((char *) path.data, O_RDONLY | O_DIRECTORY); 60 | if (fd >= 0) { 61 | break; 62 | } 63 | 64 | if (errno != EINTR) { 65 | return 
make_errno_error(env, errno); 66 | } 67 | } 68 | 69 | int sync_ret = fsync(fd); 70 | int sync_errno = errno; 71 | 72 | if (close(fd) != 0) { 73 | ERL_NIF_TERM reason = enif_make_tuple2(env, 74 | am_close_failed, 75 | make_errno_atom(env, errno)); 76 | 77 | return enif_raise_exception(env, reason); 78 | } 79 | 80 | if (sync_ret == 0) { 81 | return am_ok; 82 | } else { 83 | return make_errno_error(env, sync_errno); 84 | } 85 | } 86 | 87 | static ErlNifFunc nif_functions[] = { 88 | {"do_sync_dir", 1, do_sync_dir, ERL_NIF_DIRTY_JOB_IO_BOUND}, 89 | }; 90 | 91 | static int 92 | load(ErlNifEnv *env, void** priv_data, ERL_NIF_TERM load_info) 93 | { 94 | am_ok = enif_make_atom(env, "ok"); 95 | am_error = enif_make_atom(env, "error"); 96 | am_close_failed = enif_make_atom(env, "close_failed"); 97 | 98 | return 0; 99 | } 100 | 101 | ERL_NIF_INIT(chronicle_utils, nif_functions, load, NULL, NULL, NULL); 102 | -------------------------------------------------------------------------------- /examples/chronicled/.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | _* 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | .erlang.cookie 10 | ebin 11 | log 12 | erl_crash.dump 13 | .rebar 14 | logs 15 | _build 16 | .idea 17 | *.iml 18 | rebar3.crashdump 19 | *~ 20 | -------------------------------------------------------------------------------- /examples/chronicled/README.md: -------------------------------------------------------------------------------- 1 | # Example 2 | 3 | The chronicle library has very few dependencies, which is good for reuse but 4 | can mean it's hard to get started with. This example allows you to start a 5 | REST API server on a collection of nodes that drives a chronicle process. 
6 | 7 | ## Build 8 | 9 | You build all examples as follows (at the top-level directory): 10 | 11 | `rebar3 as examples compile` 12 | 13 | There are instructions to get rebar3 on your system at: https://github.com/erlang/rebar3. 14 | Given that you'll need Erlang on your system to run chronicle, the easiest 15 | thing to do is probably to build it and install it locally via: 16 | 17 | ``` 18 | $ git clone https://github.com/erlang/rebar3.git 19 | $ cd rebar3 20 | $ ./bootstrap 21 | $ ./rebar3 local install 22 | ``` 23 | 24 | ## Start a cluster of example nodes 25 | 26 | Run: 27 | 28 | `start_cluster --app chronicled --num-nodes N --hostname 127.0.0.1` 29 | 30 | This will start a cluster of N example nodes listening on the loopback 31 | interface. The `--app chronicled` argument instructs the script to start the 32 | example application named `chronicled` - the script can also be used to start nodes 33 | running only chronicle. 34 | 35 | The i-th node in the cluster is: 36 | - named `chronicle_i@127.0.0.1` 37 | - listens on port `(8080+i)` 38 | 39 | ## Provision one node 40 | 41 | Run: 42 | 43 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/config/provision` 44 | 45 | This will "provision" node 0, that is, turn node 0 from an uninitialized node 46 | into an initialized one-node cluster running chronicle. One replicated state 47 | machine is provisioned with name `kv`. 48 | 49 | ## Check the configuration 50 | 51 | Run: 52 | 53 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/config/info` 54 | 55 | You should see something like: 56 | 57 | ``` 58 | HTTP/1.1 200 OK 59 | content-length: 60 60 | content-type: application/json 61 | date: Fri, 25 Sep 2020 05:41:28 GMT 62 | server: Cowboy 63 | {"voters":["chronicle_0@127.0.0.1"]}. 
64 | ``` 65 | 66 | ## Add a key-value pair 67 | 68 | Run: 69 | 70 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/kv/key -X PUT -d '1'` 71 | 72 | 73 | ## Get the value associated with a key 74 | 75 | Run: 76 | 77 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/kv/key` 78 | 79 | You should see something like this: 80 | 81 | ``` 82 | HTTP/1.1 200 OK 83 | content-length: 81 84 | content-type: application/json 85 | date: Fri, 25 Sep 2020 04:10:16 GMT 86 | server: Cowboy 87 | {"rev":{"history_id":"6e4d2640cbe41b818bb5af4407142be9","seqno":2},"value":1} 88 | ``` 89 | 90 | ## Update a value 91 | 92 | Run: 93 | 94 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/kv/key -X POST -d '{"value": 1}'` 95 | 96 | PUTs are used to add key-value pairs; POSTs are used to update the value. Note 97 | that the value can be set to arbitrary JSON. 98 | 99 | 100 | ## Add nodes 101 | 102 | To add one node, run: 103 | 104 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/config/addnode -d '"chronicle_1@127.0.0.1"'` 105 | 106 | To add two, run: 107 | 108 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/config/addnode 109 | -d '["chronicle_1@127.0.0.1", "chronicle_2@127.0.0.1"]'` 110 | 111 | Check the configuration from the newly added node: 112 | 113 | ``` 114 | $ curl -i -H "Content-Type: application/json" localhost:8081/config/info 115 | HTTP/1.1 200 OK 116 | content-length: 60 117 | content-type: application/json 118 | date: Fri, 25 Sep 2020 05:41:28 GMT 119 | server: Cowboy 120 | {"voters":["chronicle_0@127.0.0.1","chronicle_1@127.0.0.1"]}. 121 | ``` 122 | 123 | Verify that the new nodes also return the value associated with the key. 
Run: 124 | 125 | 126 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8081/kv/key` 127 | 128 | Again you should see something like: 129 | 130 | ``` 131 | HTTP/1.1 200 OK 132 | content-length: 81 133 | content-type: application/json 134 | date: Fri, 25 Sep 2020 04:42:46 GMT 135 | server: Cowboy 136 | {"rev":{"history_id":"6e4d2640cbe41b818bb5af4407142be9","seqno":6},"value":{"value":1}} 137 | ``` 138 | 139 | ## Remove nodes 140 | 141 | To remove a node, run: 142 | 143 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8080/config/removenode 144 | -d '"chronicle_0@127.0.0.1"'` 145 | 146 | ## Delete a key 147 | 148 | Run: 149 | 150 | `curl -i -H "Content-Type: application/json" 127.0.0.1:8081/kv/key -X DELETE` 151 | -------------------------------------------------------------------------------- /examples/chronicled/rebar.config: -------------------------------------------------------------------------------- 1 | {erl_opts, [debug_info]}. 2 | {deps, [chronicle, 3 | {cowboy, 4 | {git, "https://github.com/ninenines/cowboy", 5 | {tag, "2.8.0"}}}, 6 | {jiffy, {git, "git@github.com:davisp/jiffy.git", 7 | {tag, "1.0.5"}}}]}. 8 | {shell, [ 9 | {apps, [chronicled]} 10 | ]}. 11 | -------------------------------------------------------------------------------- /examples/chronicled/src/chronicled.app.src: -------------------------------------------------------------------------------- 1 | {application, chronicled, 2 | [{description, "An example on how to use chronicle."}, 3 | {vsn, "0.1.0"}, 4 | {registered, []}, 5 | {mod, {chronicled_app, []}}, 6 | {applications, 7 | [kernel, 8 | stdlib, 9 | chronicle, 10 | ranch 11 | ]}, 12 | {env,[]}, 13 | {modules, []}, 14 | 15 | {licenses, ["Apache 2.0"]}, 16 | {links, []} 17 | ]}. 
18 | -------------------------------------------------------------------------------- /examples/chronicled/src/chronicled_app.erl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2020 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %% 16 | -module(chronicled_app). 17 | 18 | -behaviour(application). 19 | 20 | -export([start/2, stop/1]). 21 | 22 | start(_StartType, _StartArgs) -> 23 | chronicled_sup:start_link(). 24 | 25 | stop(_State) -> 26 | ok. 27 | 28 | %% internal functions 29 | -------------------------------------------------------------------------------- /examples/chronicled/src/chronicled_sup.erl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2020 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %% 16 | -module(chronicled_sup). 17 | 18 | -behaviour(supervisor). 19 | 20 | -export([start_link/0]). 21 | -export([init/1]). 22 | 23 | -define(SERVER, ?MODULE). 24 | 25 | start_link() -> 26 | supervisor:start_link({local, ?SERVER}, ?MODULE, []). 27 | 28 | init([]) -> 29 | SupFlags = #{strategy => one_for_all, 30 | intensity => 0, 31 | period => 1}, 32 | ChildSpecs = [#{id => chronicled_server, 33 | start => {chronicled_server, start, []}, 34 | type => worker}], 35 | {ok, {SupFlags, ChildSpecs}}. 36 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %% If you make changes to this file, make sure to make corresponding changes 2 | %% to rebar2.config as well (if applicable). 3 | 4 | {profiles, 5 | [{test, [{deps, 6 | [{vnet, 7 | {git, "https://github.com/couchbasedeps/vnet.git", 8 | {branch, "master"}}}] 9 | }, 10 | {dialyzer, [{plt_extra_apps, [eunit, vnet]}]}] 11 | }, 12 | {examples, [{project_app_dirs, ["examples/*", "."]}, 13 | {dialyzer, [{plt_extra_apps, [cowboy, jiffy]}]}]}] 14 | }. 15 | 16 | {escript_main_app, chronicle}. 17 | {escript_name, chronicle_dump}. 18 | {escript_emu_args, "%%! -escript main chronicle_dump\n"}. 19 | 20 | {src_dirs, ["src", "scripts/chronicle_dump"]}. 21 | {erl_opts, [debug_info, warn_export_vars, 22 | {platform_define, "linux", 'HAVE_SYNC_DIR'}]}. 23 | {erl_first_files, ["src/dynamic_supervisor.erl"]}. 24 | {minimum_otp_vsn, "22"}. 25 | {dialyzer, [{warnings, 26 | [error_handling, 27 | race_conditions, 28 | unmatched_returns, 29 | unknown]}]}. 30 | 31 | {xref_extra_paths, ["test"]}. 32 | {xref_checks,[undefined_function_calls, 33 | undefined_functions, 34 | locals_not_used, 35 | deprecated_function_calls, 36 | deprecated_functions]}. 37 | {plugins, [pc]}. 
38 | {provider_hooks, 39 | [{pre, [{compile, {pc, compile}}, 40 | {clean, {pc, clean}}]}, 41 | {post, [{compile, escriptize}]}]}. 42 | {port_specs, [{"linux", "priv/sync_nif.so", ["c_src/sync_nif.c"]}]}. 43 | {port_env, [{"CFLAGS", "$CFLAGS -Wall -Wno-unused-command-line-argument -Werror -std=gnu99"}]}. 44 | 45 | %% We need to define artifacts declarations to ensure that these files 46 | %% get built because rebar3 is only capable of detecting that beam 47 | %% files have been recompiled. 48 | {artifacts, ["{{profile_dir}}/bin/chronicle_dump"]}. 49 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | []. 2 | -------------------------------------------------------------------------------- /rebar2.config: -------------------------------------------------------------------------------- 1 | %% This is a rebar2 compatible rebar.config file used as part of the 2 | %% couchbase-server build. If you change something here, make sure to make 3 | %% corresponding changes to rebar.config as well. 4 | 5 | {escript_main_app, chronicle}. 6 | {escript_name, chronicle_dump}. 7 | {escript_emu_args, "%%! -escript main chronicle_dump\n"}. 8 | 9 | {erl_opts, [debug_info, warn_export_vars, 10 | {i, ["src"]}, 11 | {src_dirs, ["src", "scripts/chronicle_dump"]}, 12 | {platform_define, "linux", 'HAVE_SYNC_DIR'}]}. 13 | {erl_first_files, ["src/dynamic_supervisor.erl"]}. 14 | 15 | {port_specs, [{"linux", "priv/sync_nif.so", ["c_src/sync_nif.c"]}]}. 16 | {port_env, [{"CFLAGS", "$CFLAGS -Wall -Wno-unused-command-line-argument -Werror -std=gnu99"}]}. 
17 | -------------------------------------------------------------------------------- /rebar2.config.script: -------------------------------------------------------------------------------- 1 | Release = erlang:system_info(otp_release), 2 | try list_to_integer(Release) of 3 | R when R > 22 -> 4 | PortEnv = proplists:get_value(port_env, CONFIG, []), 5 | NewPortEnv = [{"ERL_LDFLAGS", "-L$ERL_EI_LIBDIR -lei"} | PortEnv], 6 | lists:keystore(port_env, 1, CONFIG, {port_env, NewPortEnv}); 7 | _ -> 8 | CONFIG 9 | catch 10 | error:badarg -> 11 | CONFIG 12 | end. 13 | -------------------------------------------------------------------------------- /scripts/chronicle_dump/chronicle_dump.erl: -------------------------------------------------------------------------------- 1 | -module(chronicle_dump). 2 | 3 | -include("chronicle.hrl"). 4 | 5 | -export([main/1]). 6 | -export([raw/1]). 7 | 8 | -define(fmt(Msg), ?fmt(Msg, [])). 9 | -define(fmt(Fmt, Args), ?fmt(group_leader(), Fmt, Args)). 10 | -define(fmt(IoDevice, Fmt, Args), io:format(IoDevice, Fmt ++ "~n", Args)). 11 | 12 | -define(error(Msg), ?error(Msg, [])). 13 | -define(error(Fmt, Args), ?fmt(standard_error, Fmt, Args)). 14 | 15 | -define(RAW_TAG, '$chronicle_dump_raw'). 16 | 17 | -define(STATUS_OK, 0). 18 | -define(STATUS_FATAL, 1). 19 | -define(STATUS_ERROR, 2). 20 | 21 | raw(Term) -> 22 | {?RAW_TAG, Term}. 23 | 24 | parse_args(Args, Spec0) -> 25 | Spec = maps:from_list( 26 | lists:map( 27 | fun ({Opt, Type}) -> 28 | {atom_to_list(Opt), Type} 29 | end, maps:to_list(Spec0))), 30 | parse_args_loop(Args, [], #{}, Spec). 
31 | 32 | parse_args_loop([], AccArgs, AccOptions, _Spec) -> 33 | {lists:reverse(AccArgs), AccOptions}; 34 | parse_args_loop([Arg|Args], AccArgs, AccOptions, Spec) -> 35 | case Arg of 36 | "--" -> 37 | {lists:reverse(AccArgs, Args), AccOptions}; 38 | "--" ++ Option -> 39 | case maps:find(Option, Spec) of 40 | {ok, flag} -> 41 | Opt = list_to_atom(Option), 42 | parse_args_loop(Args, AccArgs, 43 | AccOptions#{Opt => true}, 44 | Spec); 45 | {ok, {option, Fun}} -> 46 | parse_args_option(Arg, Option, Fun, 47 | Args, AccArgs, AccOptions, Spec); 48 | error -> 49 | usage("unknown option '~s'", [Arg]) 50 | end; 51 | _ -> 52 | parse_args_loop(Args, [Arg | AccArgs], AccOptions, Spec) 53 | end. 54 | 55 | parse_args_option(Option, _Name, _Fun, [], _AccArgs, _AccOption, _Spec) -> 56 | usage("argument required for '~s'", [Option]); 57 | parse_args_option(Option, Name, Fun, [Arg|Args], AccArgs, AccOptions, Spec) -> 58 | case Fun(Arg) of 59 | {ok, Result} -> 60 | NewAccOptions = AccOptions#{list_to_atom(Name) => Result}, 61 | parse_args_loop(Args, AccArgs, NewAccOptions, Spec); 62 | {error, Error} -> 63 | usage("invalid argument '~s' for '~s': ~w", 64 | [Arg, Option, Error]) 65 | end. 66 | 67 | dump_logs(Args) -> 68 | {Paths, Options} = parse_args(Args, 69 | #{sanitize => {option, fun sanitize_opt/1}, 70 | decrypt => {option, fun decrypt_opt/1}}), 71 | DecryptFun = maps:get(decrypt, Options, fun (D) -> {ok, D} end), 72 | chronicle_env:setup_decrypt_function(DecryptFun), 73 | dump_many(Paths, 74 | fun (Path) -> 75 | dump_log(Path, Options) 76 | end). 
77 | 78 | dump_log(Path, Options) -> 79 | ?fmt("Dumping '~s'~n", [Path]), 80 | SanitizeFun = 81 | case maps:find(sanitize, Options) of 82 | {ok, Fun} -> 83 | Fun; 84 | error -> 85 | fun (_, V) -> V end 86 | end, 87 | 88 | try chronicle_log:read_log(Path, 89 | fun dump_log_header/2, 90 | fun (Entry, State) -> 91 | dump_log_entry(SanitizeFun, Entry, State) 92 | end, 93 | header) of 94 | {ok, _} -> 95 | ok; 96 | {error, Error} -> 97 | set_error(), 98 | ?error("Error while dumping '~s': ~w", [Path, Error]) 99 | catch 100 | T:E:Stacktrace -> 101 | set_error(), 102 | ?error("Unexpected exception: ~p:~p. Stacktrace:~n" 103 | "~p", 104 | [T, E, 105 | chronicle_utils:sanitize_stacktrace(Stacktrace)]) 106 | end. 107 | 108 | dump_log_header(Header, header) -> 109 | ?fmt("Header:"), 110 | dump_term(indent(), Header), 111 | first. 112 | 113 | dump_log_entry(SanitizeFun, Entry, State) -> 114 | case State of 115 | first -> 116 | ?fmt("~nEntries:"); 117 | rest -> 118 | ok 119 | end, 120 | 121 | dump_term(indent(), unpack_entry(SanitizeFun, Entry)), 122 | rest. 123 | 124 | unpack_entry(SanitizeFun, Entry) -> 125 | chronicle_storage:map_append( 126 | fun (#log_entry{value = Value} = LogEntry) -> 127 | case Value of 128 | #rsm_command{rsm_name = Name} -> 129 | Sanitized = chronicle_rsm:unpack_payload( 130 | fun (Payload) -> 131 | SanitizeFun(Name, Payload) 132 | end, Value), 133 | 134 | LogEntry#log_entry{value = Sanitized}; 135 | _ -> 136 | LogEntry 137 | end 138 | end, Entry). 139 | 140 | sanitize_opt(Value) -> function_opt(Value, 2). 141 | decrypt_opt(Value) -> function_opt(Value, 1). 142 | 143 | function_opt(Value, Arity) -> 144 | case string:split(Value, ":", all) of 145 | [Module, Function] -> 146 | M = list_to_atom(Module), 147 | F = list_to_atom(Function), 148 | case chronicle_utils:is_function_exported(M, F, Arity) of 149 | true -> 150 | {ok, fun M:F/Arity}; 151 | false -> 152 | {error, not_exported} 153 | end; 154 | _ -> 155 | {error, invalid_function} 156 | end. 
%% Convert one dumpguts key or value to iodata. Binaries and strings are
%% passed through unchanged; atoms and integers are converted to text.
output_item(Binary) when is_binary(Binary) -> Binary;
output_item(Atom) when is_atom(Atom) -> atom_to_binary(Atom, latin1);
output_item(Int) when is_integer(Int) -> integer_to_list(Int);
output_item(String) when is_list(String) -> String.

%% Handler of the `dumpguts' command.
%%
%% Accepts an optional `--decrypt' option and exactly one snapshot file.
%% On success the extracted keys and values are written to the group
%% leader as a single line in which every item is followed by a NUL byte.
%% On failure the error is reported on standard_error (by
%% dump_guts_inner/1) and the error exit status is recorded.
dump_guts(Args) ->
    {Paths, Options} = parse_args(Args,
                                  #{decrypt => {option, fun decrypt_opt/1}}),
    DecryptFun = maps:get(decrypt, Options, fun (D) -> {ok, D} end),
    chronicle_env:setup_decrypt_function(DecryptFun),

    %% parse_args/2 returns the list of positional arguments. Passing the
    %% list itself as a file name only happened to work for one argument.
    case Paths of
        [Path] ->
            output_guts(dump_guts_inner(Path));
        _ ->
            usage("dumpguts expects exactly one FILE, got ~b",
                  [length(Paths)])
    end.

%% Print the guts returned by dump_guts_inner/1, or record the failure so
%% that the script exits with a non-zero status.
output_guts({ok, Guts}) ->
    Items = [E || {K, V} <- Guts, E <- [K, V]],
    ?fmt("~s", [[[output_item(Item) | <<0:8>>] || Item <- Items]]);
output_guts(error) ->
    set_error().

%% Read the snapshot at Path and extract the guts from it.
%%
%% Returns {ok, [{Key, Value}]} on success. If the snapshot can't be read
%% or doesn't have the expected shape, the problem is reported on
%% standard_error and `error' is returned. The previous version returned
%% the `ok' produced by the error report, which the caller then tried to
%% iterate over.
dump_guts_inner(Path) ->
    case chronicle_storage:read_rsm_snapshot(Path) of
        {ok, Snapshot} ->
            try extract_guts(Snapshot) of
                Guts ->
                    {ok, Guts}
            catch
                T:E:Stacktrace ->
                    ?error("Unexpected exception: ~p:~p. Stacktrace:~n"
                           "~p",
                           [T, E,
                            chronicle_utils:sanitize_stacktrace(Stacktrace)]),
                    error
            end;
        {error, Error} ->
            ?error("Couldn't read snapshot '~s': ~w", [Path, Error]),
            error
    end.

%% Pull the reported keys out of the "RSM state" section of a formatted
%% snapshot. The section must be wrapped with raw/1; anything else raises
%% an exception, which dump_guts_inner/1 reports.
extract_guts(Snapshot) ->
    Props0 = chronicle_rsm:format_snapshot(Snapshot),
    Props1 = proplists:get_value("RSM state", Props0),
    {?RAW_TAG, Props} = Props1,

    CompatVer = get_value(cluster_compat_version, Props),
    BucketNames = get_value(bucket_names, Props),

    [{cluster_compat_version,
      iolist_to_binary(io_lib:format("~p", [CompatVer]))},
     {bucket_names, string:join(BucketNames, ",")},
     {rebalance_type, get_value(rebalance_type, Props)}].

%% Look up Key in Props, where each stored value is a {Value, Meta} pair.
%% Returns the bare Value, or `undefined' when the key is absent.
get_value(Key, Props) ->
    case proplists:get_value(Key, Props) of
        {Value, _ChronicleMeta} ->
            Value;
        undefined ->
            undefined
    end.
%% Handler of the `snapshot' command: parse the options, install the
%% decrypt function (identity when `--decrypt' is not given) and dump
%% every snapshot file named on the command line.
dump_snapshots(Args) ->
    Spec = #{raw => flag,
             sanitize => {option, fun sanitize_opt/1},
             decrypt => {option, fun decrypt_opt/1}},
    {Paths, Options} = parse_args(Args, Spec),
    Identity = fun (Data) -> {ok, Data} end,
    chronicle_env:setup_decrypt_function(
      maps:get(decrypt, Options, Identity)),
    dump_many(Paths, fun (Path) -> dump_snapshot(Path, Options) end).

%% Apply Dump to every path in order, printing a separator between
%% consecutive dumps (but not after the last one). Returns `ok' for an
%% empty list, otherwise the result of the last Dump call.
dump_many([], _Dump) ->
    ok;
dump_many([Path | Rest], Dump) ->
    Result = Dump(Path),
    case Rest of
        [] ->
            Result;
        _ ->
            ?fmt("~n"),
            dump_many(Rest, Dump)
    end.

%% Run the user-supplied sanitizer over the snapshot when `--sanitize'
%% was given; otherwise return the snapshot untouched.
sanitize_snapshot(Snapshot, Options) ->
    case Options of
        #{sanitize := Sanitizer} ->
            chronicle_rsm:map_snapshot(Sanitizer, Snapshot);
        _ ->
            Snapshot
    end.

%% Dump a single snapshot file. With the `raw' flag the (sanitized)
%% snapshot term is printed as-is, otherwise its formatted properties
%% are printed. Read failures and unexpected exceptions are reported on
%% standard_error and recorded in the exit status.
dump_snapshot(Path, Options) ->
    case chronicle_storage:read_rsm_snapshot(Path) of
        {error, Error} ->
            set_error(),
            ?error("Couldn't read snapshot '~s': ~w", [Path, Error]);
        {ok, RawSnapshot} ->
            ?fmt("Dumping '~s'~n", [Path]),

            try
                Snapshot = sanitize_snapshot(RawSnapshot, Options),
                WantRaw = maps:get(raw, Options, false),

                case WantRaw of
                    false ->
                        dump_props(chronicle_rsm:format_snapshot(Snapshot));
                    true ->
                        dump_term(Snapshot)
                end
            catch
                T:E:Stacktrace ->
                    set_error(),
                    ?error("Unexpected exception: ~p:~p. Stacktrace:~n"
                           "~p",
                           [T, E,
                            chronicle_utils:sanitize_stacktrace(Stacktrace)])
            end
    end.

%% Print a property list without any indentation.
dump_props(Props) ->
    dump_props("", Props).

%% Print every element of a property list, prefixing each printed line
%% with Indent.
dump_props(Indent, Props) when is_list(Props) ->
    DumpOne = fun (Elem) -> dump_elem(Indent, Elem) end,
    lists:foreach(DumpOne, Props).
%% Print one element of a property list: 2-tuples are printed as
%% "name: value" pairs, string-like elements as plain text and everything
%% else as an Erlang term.
%%
%% NOTE(review): a {?RAW_TAG, Term} element is itself a 2-tuple, so it
%% takes the pair branch below and never reaches the raw clause of
%% type/1 from here -- confirm that raw terms only ever appear as pair
%% values.
dump_elem(Indent, Elem) ->
    case Elem of
        {_, _} = Pair ->
            dump_pair(Indent, Pair);
        _ ->
            case type(Elem) of
                {string, String} ->
                    dump_string(Indent, String);
                {_, Term} ->
                    dump_term(Indent, Term)
            end
    end.

%% Print a string-like value on its own line, preceded by Indent.
dump_string(Indent, String) ->
    ?fmt("~s~s", [Indent, String]).

%% Pretty-print a term without indentation.
dump_term(Term) ->
    dump_term("", Term).

%% Pretty-print a term (line width 250), preceded by Indent.
dump_term(Indent, Term) ->
    ?fmt("~s~250p", [Indent, Term]).

%% Print a {Name, Value} pair. When Name is string-like the pair is
%% printed as "Name: Value"; large terms and nested lists go on the
%% following lines with one more level of indentation. Pairs whose name
%% is not string-like are printed as plain terms.
dump_pair(Indent, {Name0, Value0} = Pair) ->
    case type(Name0) of
        {string, Name} ->
            case type(Value0) of
                {string, Value} ->
                    ?fmt("~s~s: ~s", [Indent, Name, Value]);
                {term, Value} ->
                    case large(Value) of
                        true ->
                            ?fmt("~s~s:", [Indent, Name]),
                            dump_term(indent(Indent), Value);
                        false ->
                            ?fmt("~s~s: ~250p", [Indent, Name, Value])
                    end;
                {list, Value} ->
                    ?fmt("~s~s:", [Indent, Name]),
                    dump_props(indent(Indent), Value)
            end;
        _ ->
            dump_term(Indent, Pair)
    end.

%% One level of indentation.
indent() ->
    indent("").

%% Indent extended by one more level.
indent(Indent) ->
    " " ++ Indent.

%% Whether a term is big enough (per erts_debug:flat_size/1) to be
%% printed on its own lines rather than next to its name.
large(Term) ->
    erts_debug:flat_size(Term) > 100.

%% Printable lists, binaries and atoms can all be printed with ~s.
stringlike(Term) ->
    io_lib:printable_list(Term) orelse is_binary(Term) orelse is_atom(Term).

%% Classify a value for printing: `string' for string-like values,
%% `list' for other lists and `term' for everything else. Values wrapped
%% with raw/1 are unwrapped and always classified as `term'.
type({?RAW_TAG, Term}) ->
    {term, Term};
type(Term) ->
    case stringlike(Term) of
        true ->
            {string, Term};
        false ->
            case is_list(Term) of
                true ->
                    {list, Term};
                false ->
                    {term, Term}
            end
    end.

%% Print the command-line synopsis on standard_error and terminate the
%% script with the fatal exit status.
%%
%% --sanitize and --decrypt each require an argument of the form
%% Module:Function (see function_opt/2); the synopsis used to show them
%% without any argument placeholder.
-spec usage() -> no_return().
usage() ->
    ?error("Usage: ~s [COMMAND]", [escript:script_name()]),
    ?error("Commands:"),
    ?error(" snapshot [--raw] [--sanitize MODULE:FUNCTION] "
           "[--decrypt MODULE:FUNCTION] [FILE]..."),
    ?error(" dumpguts [--decrypt MODULE:FUNCTION] [FILE]"),
    ?error(" log [--sanitize MODULE:FUNCTION] "
           "[--decrypt MODULE:FUNCTION] [FILE]..."),
    stop(?STATUS_FATAL).
%% Report a command-line error (prefixed with the script name), then
%% print the synopsis and terminate via usage/0.
-spec usage(Fmt::io:format(), Args::[any()]) -> no_return().
usage(Fmt, Args) ->
    Prefixed = "~s: " ++ Fmt,
    Script = escript:script_name(),
    ?error(Prefixed, [Script | Args]),
    usage().

%% Halt the VM with the given exit status.
-spec stop(non_neg_integer()) -> no_return().
stop(Status) ->
    %% Best effort: flush pending log output; any failure to do so is
    %% deliberately ignored.
    catch logger_std_h:filesync(default),
    erlang:halt(Status).

%% Exit status recorded so far in the process dictionary; ?STATUS_OK if
%% nothing has been recorded.
status() ->
    case erlang:get(status) of
        undefined -> ?STATUS_OK;
        Recorded -> Recorded
    end.

%% Record an exit status. Only the first non-OK status sticks: once one
%% has been recorded, later calls leave it alone.
set_status(Status) ->
    record_status(status(), Status).

record_status(?STATUS_OK, Status) ->
    erlang:put(status, Status);
record_status(_Recorded, _Status) ->
    ok.

%% Record that a non-fatal error happened.
set_error() ->
    set_status(?STATUS_ERROR).

%% Set up chronicle's logger and replace the default handler with one
%% that writes single-line "level: msg" entries to standard_error.
setup_logger() ->
    ok = chronicle_env:setup_logger(),
    ok = logger:remove_handler(default),

    Template = [level, ": ", msg, "\n"],
    Formatter = {logger_formatter,
                 #{single_line => true, template => Template}},
    HandlerConfig = #{config => #{type => standard_error},
                      formatter => Formatter},
    ok = logger:add_handler(default, logger_std_h, HandlerConfig).

%% escript entry point: disable NIF loading, set up logging, run the
%% requested command and exit with the recorded status.
-spec main(list()) -> no_return().
main(Args) ->
    persistent_term:put(?CHRONICLE_LOAD_NIFS, false),
    setup_logger(),
    run_command(Args),
    stop(status()).

%% Dispatch on the command name; unknown or missing commands end in
%% usage/0,2, which do not return.
run_command(["snapshot" | RestArgs]) ->
    dump_snapshots(RestArgs);
run_command(["log" | RestArgs]) ->
    dump_logs(RestArgs);
run_command(["dumpguts" | RestArgs]) ->
    dump_guts(RestArgs);
run_command([Command | _RestArgs]) ->
    usage("unknown command '~s'", [Command]);
run_command(_) ->
    usage().
414 | -------------------------------------------------------------------------------- /scripts/test_leader_remove_addback.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o pipefail 4 | set -o errexit 5 | 6 | num_nodes=5 7 | # Print the .leader.node field of the status served on port 8080. 8 | get_leader() { 9 | curl --fail -s -XPOST -H "Content-Type: application/json" \ 10 | 127.0.0.1:8080/node/status | \ 11 | jq -r .leader.node 12 | } 13 | # Poll get_leader until it reports a leader, then print that node's one-character id. 14 | wait_leader() { 15 | local leader 16 | 17 | while true; do 18 | leader=$(get_leader) 19 | if [ "$leader" = null ]; then 20 | sleep 0.1s 21 | continue 22 | fi 23 | 24 | echo "$leader" | sed 's/^chronicle_\(.\)@127.0.0.1$/\1/' 25 | break 26 | done 27 | } 28 | # eject VIA NODE: POST a removenode request for chronicle_NODE to port 8080+VIA. 29 | eject() { 30 | local port=$(( 8080 + $1 )) 31 | curl --fail -s -XPOST -H "Content-Type: application/json" \ 32 | 127.0.0.1:$port/config/removenode \ 33 | -d "\"chronicle_$2@127.0.0.1\"" > /dev/null 34 | } 35 | # wipe NODE: POST a wipe request to port 8080+NODE. 36 | wipe() { 37 | local port=$(( 8080 + $1 )) 38 | curl --fail -s -XPOST -H "Content-Type: application/json" \ 39 | 127.0.0.1:$port/node/wipe > /dev/null 40 | } 41 | # addback VIA NODE: POST an addnode request for chronicle_NODE to port 8080+VIA. 42 | addback() { 43 | local port=$(( 8080 + $1 )) 44 | curl --fail -s -XPOST -H "Content-Type: application/json" \ 45 | 127.0.0.1:$port/config/addnode \ 46 | -d "\"chronicle_$2@127.0.0.1\"" > /dev/null 47 | } 48 | # get_pid NODE: print the pid of the beam.smp process of chronicle_NODE; fail if pgrep finds none. 49 | get_pid() { 50 | pid=$(pgrep -f "beam.smp.*chronicle_$1@") 51 | if [ -z "$pid" ]; then 52 | false 53 | else 54 | echo "$pid" 55 | fi 56 | } 57 | 58 | while true; do 59 | leader=$(wait_leader) 60 | echo "Node $leader is the leader" 61 | 62 | other_node=$(( (leader + 1) % $num_nodes )) 63 | echo "Helper node is $other_node" 64 | 65 | pid=$(get_pid $leader) 66 | echo "Node $leader pid is $pid" 67 | 68 | kill -SIGSTOP "$pid" 69 | echo "Stopped node $leader" 70 | 71 | eject $other_node $leader 72 | echo "Node $leader ejected" 73 | 74 | kill -SIGCONT "$pid" 75 | echo "Node $leader resumed" 76 | 77 | wipe $leader 78 | echo "Node $leader wiped" 79 | 80 | addback $other_node $leader 81 | echo
"Node $leader added back" 82 | echo "=========================" 83 | done 84 | -------------------------------------------------------------------------------- /src/chronicle.app.src: -------------------------------------------------------------------------------- 1 | {application,chronicle, 2 | [{description,"Chronicle"}, 3 | {vsn,"0.0.1"}, 4 | {licenses,["Apache"]}, 5 | {links,[{"github","https://github.com/couchbase/chronicle"}]}, 6 | {modules,[]}, 7 | {registered,[]}, 8 | {applications,[kernel,stdlib]}, 9 | {mod,{chronicle_app,[]}}]}. 10 | -------------------------------------------------------------------------------- /src/chronicle.erl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2020 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %% 16 | -module(chronicle). 17 | 18 | -include("chronicle.hrl"). 19 | 20 | -ifdef(TEST). 21 | -include_lib("eunit/include/eunit.hrl"). 22 | -endif. 23 | 24 | -export([get_system_state/0]). 25 | -export([force_snapshot/0, export_snapshot/1]). 26 | -export([check_quorum/0, check_quorum/1]). 27 | -export([get_peer_statuses/0, get_cluster_status/0]). 28 | -export([provision/1, reprovision/0, wipe/0]). 29 | -export([get_cluster_info/0, get_cluster_info/1]). 30 | -export([prepare_join/1, join_cluster/1]). 31 | -export([failover/1, failover/2, try_cancel_failover/2]). 
32 | -export([acquire_lock/0, acquire_lock/1]). 33 | -export([get_peers/0, get_peers/1, 34 | get_voters/0, get_voters/1, get_replicas/0, get_replicas/1]). 35 | -export([add_voter/1, add_voter/2, add_voters/1, add_voters/2, 36 | add_replica/1, add_replica/2, add_replicas/1, add_replicas/2, 37 | add_peer/2, add_peer/3, add_peer/4, 38 | add_peers/1, add_peers/2, add_peers/3, 39 | remove_peer/1, remove_peer/2, remove_peers/1, remove_peers/2]). 40 | -export([set_peer_role/2, set_peer_role/3, set_peer_role/4, 41 | set_peer_roles/1, set_peer_roles/2, set_peer_roles/3]). 42 | -export([switch_compat_version/0, switch_compat_version/1]). 43 | -export([compare_revisions/2]). 44 | -export([put_rsm/1]). 45 | 46 | %% For internal use only currently. Changing these may render chronicle 47 | %% unusable. 48 | -export([set_setting/2, set_setting/3, 49 | unset_setting/1, unset_setting/2, 50 | replace_settings/1, replace_settings/2]). 51 | 52 | -export_type([uuid/0, peer/0, peer_id/0, peers/0, peers_and_roles/0, 53 | history_id/0, history_log/0, compat_version/0, 54 | leader_term/0, seqno/0, peer_position/0, 55 | revision/0, 56 | serial/0, incarnation/0, 57 | cluster_info/0, 58 | lock/0, lockreq/0, role/0]). 59 | 60 | -define(DEFAULT_TIMEOUT, 15000). 61 | 62 | -type uuid() :: binary(). 63 | -type peer() :: atom(). 64 | -type peer_id() :: uuid(). 65 | -type peers() :: [peer()]. 66 | -type peers_and_roles() :: [{peer(), role()}]. 67 | -type history_id() :: binary(). 68 | -type history_log() :: [{history_id(), seqno()}]. 69 | -type compat_version() :: non_neg_integer(). 70 | 71 | -type leader_term() :: {non_neg_integer(), peer()}. 72 | -type seqno() :: non_neg_integer(). 73 | -type peer_position() :: {TermVoted :: leader_term(), HighSeqno :: seqno()}. 74 | -type revision() :: {history_id(), seqno()}. 75 | 76 | -type serial() :: non_neg_integer(). 77 | -type incarnation() :: non_neg_integer(). 
78 | 79 | -type cluster_info() :: #{history_id := history_id(), 80 | committed_seqno := seqno(), 81 | peers := peers()}. 82 | 83 | -type lock() :: binary(). 84 | -type lockreq() :: lock() | unlocked. 85 | -type role() :: voter | replica. 86 | -type rsm() :: {Name :: atom(), Mod :: module(), Args :: [any()]}. 87 | 88 | -spec get_system_state() -> 89 | not_provisioned | 90 | joining_cluster | 91 | provisioned | 92 | removed. 93 | get_system_state() -> 94 | case chronicle_agent:get_system_state() of 95 | not_provisioned -> 96 | not_provisioned; 97 | {State, _Extra} 98 | when State =:= joining_cluster; 99 | State =:= provisioned; 100 | State =:= removed -> 101 | State 102 | end. 103 | 104 | -spec force_snapshot() -> chronicle_agent:force_snapshot_result(). 105 | force_snapshot() -> 106 | chronicle_agent:force_snapshot(). 107 | 108 | -spec export_snapshot(Dir::file:name()) -> 109 | chronicle_agent:export_snapshot_result(). 110 | export_snapshot(Path) -> 111 | chronicle_agent:export_snapshot(Path). 112 | 113 | -type check_quorum_result() :: true 114 | | {false, timeout | no_leader}. 115 | 116 | -spec check_quorum() -> check_quorum_result(). 117 | check_quorum() -> 118 | check_quorum(?DEFAULT_TIMEOUT). 119 | 120 | -spec check_quorum(timeout()) -> check_quorum_result(). 121 | check_quorum(Timeout) -> 122 | chronicle_config_rsm:check_quorum(Timeout). 123 | 124 | -spec get_peer_statuses() -> chronicle_status:peer_statuses(). 125 | get_peer_statuses() -> 126 | chronicle_status:get_peers(). 127 | 128 | -spec get_cluster_status() -> chronicle_status:cluster_status(). 129 | get_cluster_status() -> 130 | chronicle_status:get_cluster_status(). 131 | 132 | -spec provision([rsm()]) -> chronicle_agent:provision_result(). 133 | provision(Machines) -> 134 | chronicle_agent:provision(Machines). 135 | 136 | -spec reprovision() -> chronicle_agent:reprovision_result(). 137 | reprovision() -> 138 | chronicle_agent:reprovision(). 139 | 140 | -spec wipe() -> chronicle_agent:wipe_result(). 
141 | wipe() -> 142 | chronicle_agent:wipe(). 143 | 144 | -type acquire_lock_result() :: {ok, lock()}. 145 | -spec acquire_lock() -> acquire_lock_result(). 146 | acquire_lock() -> 147 | acquire_lock(?DEFAULT_TIMEOUT). 148 | 149 | -spec acquire_lock(timeout()) -> acquire_lock_result(). 150 | acquire_lock(Timeout) -> 151 | chronicle_config_rsm:acquire_lock(Timeout). 152 | 153 | -type no_voters_left_error() :: no_voters_left. 154 | -type lock_revoked_error() :: 155 | {lock_revoked, ExpectedLock::lock(), ActualLock::lock()}. 156 | 157 | -type remove_peers_result() :: ok | {error, remove_peers_error()}. 158 | -type remove_peers_error() :: no_voters_left_error() 159 | | lock_revoked_error(). 160 | 161 | -spec remove_peer(peer()) -> remove_peers_result(). 162 | remove_peer(Peer) -> 163 | remove_peer(unlocked, Peer). 164 | 165 | -spec remove_peer(lockreq(), peer()) -> remove_peers_result(). 166 | remove_peer(Lock, Peer) -> 167 | remove_peer(Lock, Peer, ?DEFAULT_TIMEOUT). 168 | 169 | -spec remove_peer(lockreq(), peer(), timeout()) -> remove_peers_result(). 170 | remove_peer(Lock, Peer, Timeout) -> 171 | remove_peers(Lock, [Peer], Timeout). 172 | 173 | -spec remove_peers(peers()) -> remove_peers_result(). 174 | remove_peers(Peers) -> 175 | remove_peers(unlocked, Peers). 176 | 177 | -spec remove_peers(lockreq(), peers()) -> remove_peers_result(). 178 | remove_peers(Lock, Peers) -> 179 | remove_peers(Lock, Peers, ?DEFAULT_TIMEOUT). 180 | 181 | -spec remove_peers(lockreq(), peers(), timeout()) -> remove_peers_result(). 182 | remove_peers(Lock, Peers, Timeout) -> 183 | chronicle_config_rsm:remove_peers(Lock, Peers, Timeout). 184 | 185 | -spec add_voter(peer()) -> add_peers_result(). 186 | add_voter(Peer) -> 187 | add_voter(unlocked, Peer). 188 | 189 | -spec add_voter(lockreq(), peer()) -> add_peers_result(). 190 | add_voter(Lock, Peer) -> 191 | add_voter(Lock, Peer, ?DEFAULT_TIMEOUT). 192 | 193 | -spec add_voter(lockreq(), peer(), timeout()) -> add_peers_result(). 
194 | add_voter(Lock, Peer, Timeout) -> 195 | add_voters(Lock, [Peer], Timeout). 196 | 197 | -spec add_voters(peers()) -> add_peers_result(). 198 | add_voters(Peers) -> 199 | add_voters(unlocked, Peers). 200 | 201 | -spec add_voters(lockreq(), peers()) -> add_peers_result(). 202 | add_voters(Lock, Peers) -> 203 | add_voters(Lock, Peers, ?DEFAULT_TIMEOUT). 204 | 205 | -spec add_voters(lockreq(), peers(), timeout()) -> add_peers_result(). 206 | add_voters(Lock, Peers, Timeout) -> 207 | add_peers(Lock, [{Peer, voter} || Peer <- Peers], Timeout). 208 | 209 | -spec add_replica(peer()) -> add_peers_result(). 210 | add_replica(Peer) -> 211 | add_replica(unlocked, Peer). 212 | 213 | -spec add_replica(lockreq(), peer()) -> add_peers_result(). 214 | add_replica(Lock, Peer) -> 215 | add_replica(Lock, Peer, ?DEFAULT_TIMEOUT). 216 | 217 | -spec add_replica(lockreq(), peer(), timeout()) -> add_peers_result(). 218 | add_replica(Lock, Peer, Timeout) -> 219 | add_replicas(Lock, [Peer], Timeout). 220 | 221 | -spec add_replicas(peers()) -> add_peers_result(). 222 | add_replicas(Peers) -> 223 | add_replicas(unlocked, Peers). 224 | 225 | -spec add_replicas(lockreq(), peers()) -> add_peers_result(). 226 | add_replicas(Lock, Peers) -> 227 | add_replicas(Lock, Peers, ?DEFAULT_TIMEOUT). 228 | 229 | -spec add_replicas(lockreq(), peers(), timeout()) -> add_peers_result(). 230 | add_replicas(Lock, Peers, Timeout) -> 231 | add_peers(Lock, [{Peer, replica} || Peer <- Peers], Timeout). 232 | 233 | -type add_peers_result() :: ok | {error, add_peers_error()}. 234 | -type add_peers_error() :: lock_revoked_error() 235 | | {already_member, peer(), role()}. 236 | 237 | -spec add_peer(peer(), role()) -> add_peers_result(). 238 | add_peer(Peer, Role) -> 239 | add_peer(unlocked, Peer, Role). 240 | 241 | -spec add_peer(lockreq(), peer(), role()) -> add_peers_result(). 242 | add_peer(Lock, Peer, Role) -> 243 | add_peer(Lock, Peer, Role, ?DEFAULT_TIMEOUT). 
244 | 245 | -spec add_peer(lockreq(), peer(), role(), timeout()) -> add_peers_result(). 246 | add_peer(Lock, Peer, Role, Timeout) -> 247 | add_peers(Lock, [{Peer, Role}], Timeout). 248 | 249 | -spec add_peers(peers_and_roles()) -> add_peers_result(). 250 | add_peers(Peers) -> 251 | add_peers(unlocked, Peers). 252 | 253 | -spec add_peers(lockreq(), peers_and_roles()) -> add_peers_result(). 254 | add_peers(Lock, Peers) -> 255 | add_peers(Lock, Peers, ?DEFAULT_TIMEOUT). 256 | 257 | -spec add_peers(lockreq(), peers_and_roles(), timeout()) -> add_peers_result(). 258 | add_peers(Lock, Peers, Timeout) -> 259 | chronicle_config_rsm:add_peers(Lock, Peers, Timeout). 260 | 261 | -type set_peer_roles_result() :: ok | {error, set_peer_roles_error()}. 262 | -type set_peer_roles_error() :: lock_revoked_error() 263 | | no_voters_left_error() 264 | | {not_member, peer()}. 265 | 266 | -spec set_peer_role(peer(), role()) -> set_peer_roles_result(). 267 | set_peer_role(Peer, Role) -> 268 | set_peer_role(unlocked, Peer, Role). 269 | 270 | -spec set_peer_role(lockreq(), peer(), role()) -> set_peer_roles_result(). 271 | set_peer_role(Lock, Peer, Role) -> 272 | set_peer_role(Lock, Peer, Role, ?DEFAULT_TIMEOUT). 273 | 274 | -spec set_peer_role(lockreq(), peer(), role(), timeout()) -> 275 | set_peer_roles_result(). 276 | set_peer_role(Lock, Peer, Role, Timeout) -> 277 | set_peer_roles(Lock, [{Peer, Role}], Timeout). 278 | 279 | -spec set_peer_roles(peers_and_roles()) -> set_peer_roles_result(). 280 | set_peer_roles(Peers) -> 281 | set_peer_roles(unlocked, Peers). 282 | 283 | -spec set_peer_roles(lockreq(), peers_and_roles()) -> set_peer_roles_result(). 284 | set_peer_roles(Lock, Peers) -> 285 | set_peer_roles(Lock, Peers, ?DEFAULT_TIMEOUT). 286 | 287 | -spec set_peer_roles(lockreq(), peers_and_roles(), timeout()) -> 288 | set_peer_roles_result(). 289 | set_peer_roles(Lock, Peers, Timeout) -> 290 | chronicle_config_rsm:set_peer_roles(Lock, Peers, Timeout). 
291 | 292 | -type get_peers_result() :: #{voters := peers(), replicas := peers()}. 293 | 294 | -spec get_peers() -> get_peers_result(). 295 | get_peers() -> 296 | get_peers(?DEFAULT_TIMEOUT). 297 | 298 | -spec get_peers(timeout()) -> get_peers_result(). 299 | get_peers(Timeout) -> 300 | chronicle_config_rsm:get_peers(Timeout). 301 | 302 | -spec get_voters() -> peers(). 303 | get_voters() -> 304 | get_voters(?DEFAULT_TIMEOUT). 305 | 306 | -spec get_voters(timeout()) -> peers(). 307 | get_voters(Timeout) -> 308 | #{voters := Voters} = get_peers(Timeout), 309 | Voters. 310 | 311 | -spec get_replicas() -> peers(). 312 | get_replicas() -> 313 | get_replicas(?DEFAULT_TIMEOUT). 314 | 315 | -spec get_replicas(timeout()) -> peers(). 316 | get_replicas(Timeout) -> 317 | #{replicas := Replicas} = get_peers(Timeout), 318 | Replicas. 319 | 320 | -spec get_cluster_info() -> cluster_info(). 321 | get_cluster_info() -> 322 | get_cluster_info(?DEFAULT_TIMEOUT). 323 | 324 | -spec get_cluster_info(timeout()) -> cluster_info(). 325 | get_cluster_info(Timeout) -> 326 | chronicle_config_rsm:get_cluster_info(Timeout). 327 | 328 | -spec prepare_join(cluster_info()) -> chronicle_agent:prepare_join_result(). 329 | prepare_join(ClusterInfo) -> 330 | chronicle_agent:prepare_join(ClusterInfo). 331 | 332 | -spec join_cluster(cluster_info()) -> chronicle_agent:join_cluster_result(). 333 | join_cluster(ClusterInfo) -> 334 | chronicle_agent:join_cluster(ClusterInfo). 335 | 336 | -spec failover(peers()) -> chronicle_failover:failover_result(). 337 | failover(KeepPeers) -> 338 | chronicle_failover:failover(KeepPeers). 339 | 340 | -spec failover(peers(), Opaque::any()) -> chronicle_failover:failover_result(). 341 | failover(KeepPeers, Opaque) -> 342 | chronicle_failover:failover(KeepPeers, Opaque). 343 | 344 | -spec try_cancel_failover(history_id(), peers()) -> 345 | chronicle_failover:try_cancel_result(). 346 | try_cancel_failover(Id, Peers) -> 347 | chronicle_failover:try_cancel(Id, Peers). 
348 | 349 | -spec set_setting(term(), term()) -> ok. 350 | set_setting(Name, Value) -> 351 | set_setting(Name, Value, ?DEFAULT_TIMEOUT). 352 | 353 | -spec set_setting(term(), term(), timeout()) -> ok. 354 | set_setting(Name, Value, Timeout) -> 355 | chronicle_config_rsm:set_settings(#{Name => Value}, Timeout). 356 | 357 | -spec unset_setting(term()) -> ok. 358 | unset_setting(Name) -> 359 | unset_setting(Name, ?DEFAULT_TIMEOUT). 360 | 361 | -spec unset_setting(term(), timeout()) -> ok. 362 | unset_setting(Name, Timeout) -> 363 | chronicle_config_rsm:unset_settings([Name], Timeout). 364 | 365 | -spec replace_settings(map()) -> ok. 366 | replace_settings(Settings) -> 367 | replace_settings(Settings, ?DEFAULT_TIMEOUT). 368 | 369 | -spec replace_settings(map(), timeout()) -> ok. 370 | replace_settings(Settings, Timeout) -> 371 | chronicle_config_rsm:replace_settings(Settings, Timeout). 372 | 373 | -type switch_compat_version_result() :: 374 | {ok, 375 | OldVersion::chronicle:compat_version(), 376 | NewVersion::chronicle:compat_version()} | 377 | {error, switch_compat_version_error()}. 378 | -type switch_compat_version_error() :: 379 | lock_revoked_error() | 380 | {get_peer_infos_failed, 381 | #{chronicle:peer() => Error::any()}}. 382 | 383 | -spec switch_compat_version() -> switch_compat_version_result(). 384 | switch_compat_version() -> 385 | switch_compat_version(unlocked). 386 | 387 | -spec switch_compat_version(lockreq()) -> switch_compat_version_result(). 
388 | switch_compat_version(Lock) -> 389 | #{voters := Voters, replicas := Replicas} = get_peers(), 390 | Peers = Voters ++ Replicas, 391 | case chronicle_agent:get_peer_infos(Peers) of 392 | {ok, Infos0} -> 393 | Infos = maps:values(Infos0), 394 | Versions = [maps:get(supported_compat_version, Info) || 395 | Info <- Infos], 396 | 397 | SupportedVersion = lists:min(Versions), 398 | chronicle_config_rsm:set_compat_version(Lock, SupportedVersion, 399 | ?DEFAULT_TIMEOUT); 400 | {error, Failed} -> 401 | {error, {get_peer_infos_failed, Failed}} 402 | end. 403 | 404 | -spec put_rsm(rsm()) -> ok. 405 | put_rsm(Machine) -> 406 | chronicle_config_rsm:put_rsm(Machine, ?DEFAULT_TIMEOUT). 407 | 408 | -spec compare_revisions(revision(), revision()) -> eq | gt | lt | incompatible. 409 | compare_revisions(RevA, RevB) -> 410 | {AUUID, ASeqno} = RevA, 411 | {BUUID, BSeqno} = RevB, 412 | 413 | case AUUID =:= BUUID of 414 | true -> 415 | if 416 | ASeqno > BSeqno -> 417 | gt; 418 | BSeqno > ASeqno -> 419 | lt; 420 | true -> 421 | eq 422 | end; 423 | false -> 424 | incompatible 425 | end. 426 | 427 | -ifdef(TEST). 428 | compare_revisions_test() -> 429 | eq = compare_revisions({<<"a">>, 42}, {<<"a">>, 42}), 430 | lt = compare_revisions({<<"a">>, 22}, {<<"a">>, 42}), 431 | gt = compare_revisions({<<"a">>, 42}, {<<"a">>, 22}), 432 | 433 | incompatible = compare_revisions({<<"a">>, 42}, {<<"b">>, 42}), 434 | incompatible = compare_revisions({<<"a">>, 22}, {<<"b">>, 42}), 435 | incompatible = compare_revisions({<<"a">>, 42}, {<<"b">>, 22}), 436 | 437 | incompatible = compare_revisions({<<"b">>, 42}, {<<"a">>, 42}), 438 | incompatible = compare_revisions({<<"b">>, 22}, {<<"a">>, 42}), 439 | incompatible = compare_revisions({<<"b">>, 42}, {<<"a">>, 22}). 440 | -endif. 
441 | -------------------------------------------------------------------------------- /src/chronicle.hrl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2020 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %% 16 | 17 | -ifdef(TEST). 18 | -define(PEER(), vnet:vnode()). 19 | -define(START_NAME(Name), {via, vnet, Name}). 20 | -define(SERVER_NAME(Name), {via, vnet, Name}). 21 | -define(SERVER_NAME(Peer, Name), {via, vnet, {Peer, Name}}). 22 | -define(ETS_TABLE(Name), list_to_atom("ets-" 23 | ++ atom_to_list(vnet:vnode()) 24 | ++ "-" 25 | ++ atom_to_list(Name))). 26 | -else. 27 | -define(PEER(), node()). 28 | -define(START_NAME(Name), {local, Name}). 29 | -define(SERVER_NAME(Name), Name). 30 | -define(SERVER_NAME(Peer, Name), {Name, Peer}). 31 | -define(ETS_TABLE(Name), Name). 32 | -endif. 33 | 34 | -define(NO_PEER, 'nonode@nohost'). 35 | -define(NO_PEER_ID, <<>>). 36 | -define(NO_HISTORY, <<"no-history">>). 37 | -define(NO_TERM, {0, ?NO_PEER}). 38 | -define(NO_SEQNO, 0). 39 | 40 | -define(COMPAT_VERSION, 0). 41 | 42 | -record(rsm_config, { module :: module(), 43 | args = [] :: list() }). 

%% A command addressed to the state machine named `rsm_name'. The payload
%% is either `noop' or `{command, Term}'.
-record(rsm_command,
        { rsm_name :: atom(),
          peer_id :: chronicle:peer_id(),
          peer_incarnation :: chronicle:incarnation(),
          serial :: undefined | chronicle:serial(),
          seen_serial :: chronicle:serial(),
          payload :: noop | {command, any()} }).

%% Branch description; `opaque' carries an arbitrary caller-supplied term.
-record(branch, {history_id,
                 old_history_id,
                 coordinator,
                 peers,

                 opaque :: any()}).

%% Cluster configuration. `old_peers' is `undefined' unless the config is a
%% transitional one (see chronicle_config:is_stable/1).
-record(config, { request_id :: any(),

                  compat_version :: chronicle:compat_version(),
                  lock :: undefined | binary(),
                  peers :: chronicle_config:peers(),
                  old_peers :: undefined | chronicle_config:peers(),
                  state_machines :: #{atom() => #rsm_config{}},

                  settings = #{} :: map(),

                  branch :: undefined | #branch{},
                  history_log :: chronicle:history_log() }).

%% A single log entry; its value is a no-op, a config or an RSM command.
-record(log_entry,
        { history_id :: chronicle:history_id(),
          term :: chronicle:leader_term(),
          seqno :: chronicle:seqno(),
          value :: noop | #config{} | #rsm_command{}}).

%% Untyped metadata record.
%% NOTE(review): presumably the agent's persisted metadata - confirm against
%% chronicle_agent / chronicle_storage.
-record(metadata, { peer,
                    peer_id,
                    history_id,
                    term,
                    high_term,
                    high_seqno,
                    committed_seqno,
                    config,
                    committed_config,
                    pending_branch }).

%% Level-specific logging shortcuts; all of them expand to ?LOG/3.
-define(DEBUG(Fmt, Args), ?LOG(debug, Fmt, Args)).
-define(INFO(Fmt, Args), ?LOG(info, Fmt, Args)).
-define(WARNING(Fmt, Args), ?LOG(warning, Fmt, Args)).
-define(ERROR(Fmt, Args), ?LOG(error, Fmt, Args)).

%% Same as above for messages without format arguments.
-define(DEBUG(Msg), ?DEBUG(Msg, [])).
-define(INFO(Msg), ?INFO(Msg, [])).
-define(WARNING(Msg), ?WARNING(Msg, [])).
-define(ERROR(Msg), ?ERROR(Msg, [])).

%% Evaluates Body unless the persistent term ?CHRONICLE_LOAD_NIFS is set to
%% `false' (it defaults to `true'); yields `ok' when loading is disabled.
-define(CHRONICLE_LOAD_NIFS, '$chronicle_load_nifs').
-define(WHEN_LOAD_NIFS(Body),
        %% It's impossible to load *.so files from an escript archive without
        %% lots of extra hoops. Currently I don't need this nif for
        %% chronicle_dump, so escripts will simply not even attempt to load
        %% it.
        case persistent_term:get(?CHRONICLE_LOAD_NIFS, true) of
            true ->
                Body;
            false ->
                ok
        end).

%% persistent_term keys under which the pluggable callbacks are stored.
-define(CHRONICLE_LOGGER, '$chronicle_logger').
-define(CHRONICLE_STATS, '$chronicle_stats').
-define(CHRONICLE_ENCRYPT, '$chronicle_encrypt').
-define(CHRONICLE_DECRYPT, '$chronicle_decrypt').

%% Fetches the logger fun from persistent_term and calls it with the level,
%% the format, the arguments and a metadata map describing the call site.
-define(LOG(Level, Fmt, Args),
        (persistent_term:get(?CHRONICLE_LOGGER))(
          Level, Fmt, Args,
          #{file => ?FILE,
            line => ?LINE,
            module => ?MODULE,
            function => ?FUNCTION_NAME,
            arity => ?FUNCTION_ARITY})).

%% Apply the encrypt/decrypt funs stored in persistent_term to Data.
-define(ENCRYPT(Data), (persistent_term:get(?CHRONICLE_ENCRYPT))(Data)).
-define(DECRYPT(Data), (persistent_term:get(?CHRONICLE_DECRYPT))(Data)).

%% Sequential checks: evaluate the conditions left to right for as long as
%% each one yields `ok'. The value of the first condition that is not `ok'
%% is returned; otherwise the value of the last condition is returned.
-define(CHECK(Cond1, Cond2),
        case Cond1 of
            ok ->
                Cond2;
            __Error ->
                __Error
        end).
-define(CHECK(Cond1, Cond2, Cond3),
        ?CHECK(Cond1, ?CHECK(Cond2, Cond3))).
-define(CHECK(Cond1, Cond2, Cond3, Cond4),
        ?CHECK(Cond1, ?CHECK(Cond2, Cond3, Cond4))).
-define(CHECK(Cond1, Cond2, Cond3, Cond4, Cond5),
        ?CHECK(Cond1, ?CHECK(Cond2, Cond3, Cond4, Cond5))).

%% Removes every message matching Pattern that is already in the mailbox
%% (zero receive timeout) and evaluates to the number of messages removed.
-define(FLUSH(Pattern),
        (fun __Loop(__N) ->
                 receive
                     Pattern ->
                         __Loop(__N + 1)
                 after
                     0 ->
                         __N
                 end
         end)(0)).

%% CRC width in bits and the same width in bytes.
-define(CRC_BITS, 32).
-define(CRC_BYTES, (?CRC_BITS bsr 3)).

%% Atoms naming the provisioning states.
%% NOTE(review): presumably the possible values stored under ?META_STATE -
%% confirm against chronicle_agent.
-define(META_STATE_PROVISIONED, provisioned).
-define(META_STATE_NOT_PROVISIONED, not_provisioned).
-define(META_STATE_PREPARE_JOIN, prepare_join).
-define(META_STATE_JOIN_CLUSTER, join_cluster).
-define(META_STATE_REMOVED, removed).

%% Atoms naming individual metadata fields.
%% NOTE(review): presumably keys of the persisted metadata - confirm against
%% chronicle_storage.
-define(META_STATE, state).
-define(META_PEER, peer).
-define(META_PEER_ID, peer_id).
-define(META_HISTORY_ID, history_id).
-define(META_TERM, term).
-define(META_COMMITTED_SEQNO, committed_seqno).
-define(META_PENDING_BRANCH, pending_branch).

%% Name passed to chronicle_events:start_link/1 by chronicle_agent_sup.
-define(RSM_EVENTS, chronicle_rsm_events).

%% Name of the external events process and the server reference built from
%% it with ?SERVER_NAME/1.
-define(EXTERNAL_EVENTS, chronicle_external_events).
-define(EXTERNAL_EVENTS_SERVER, ?SERVER_NAME(?EXTERNAL_EVENTS)).
--------------------------------------------------------------------------------
/src/chronicle_agent_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_agent_sup).

-behavior(supervisor).

-include("chronicle.hrl").

-export([start_link/0]).
-export([init/1]).

%% Starts the supervisor, registered under ?START_NAME(?MODULE).
start_link() ->
    supervisor:start_link(?START_NAME(?MODULE), ?MODULE, []).

%% callbacks

%% Supervisor callback. Uses a one_for_all strategy with zero restart
%% intensity, so a crash of any child takes this supervisor down as well.
init([]) ->
    Flags = #{strategy => one_for_all,
              %% Make sure that everything following chronicle_agent_sup in
              %% the top-level supervisor restarts if any of the processes
              %% here crash.
              intensity => 0,
              period => 10},
    {ok, {Flags, child_specs()}}.
%% Child specifications in start order: the snapshot manager, the RSM
%% events process and finally the agent itself.
child_specs() ->
    [#{id => chronicle_snapshot_mgr,
       start => {chronicle_snapshot_mgr, start_link, []},
       restart => permanent,
       shutdown => brutal_kill,
       type => worker},
     #{id => ?RSM_EVENTS,
       start => {chronicle_events, start_link, [?RSM_EVENTS]},
       restart => permanent,
       shutdown => 1000,
       type => worker},
     #{id => chronicle_agent,
       start => {chronicle_agent, start_link, []},
       restart => permanent,
       shutdown => 5000,
       type => worker}].
--------------------------------------------------------------------------------
/src/chronicle_app.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_app).

-export([start/2, stop/1]).

%% Application start callback: sets up the environment first and starts
%% the top-level supervisor only if that succeeded; a setup error is
%% returned unchanged.
start(_Type, _Args) ->
    SetupResult = chronicle_env:setup(),
    case SetupResult of
        {error, _} ->
            SetupResult;
        ok ->
            chronicle_sup:start_link()
    end.

%% Application stop callback; nothing to clean up.
stop(_State) ->
    ok.
--------------------------------------------------------------------------------
/src/chronicle_catchup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_catchup).

-behavior(gen_server).

-export([start_link/3]).
-export([catchup_peer/4, cancel_catchup/2, stop/1]).

-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).

-include("chronicle.hrl").

-import(chronicle_utils, [sanitize_stacktrace/1]).

%% Upper bound on concurrently running catchup workers; read from the
%% settings on every use, defaulting to 4.
-define(MAX_PARALLEL_CATCHUPS,
        chronicle_settings:get({catchup, max_parallel_catchups}, 4)).

%% Server state:
%%   parent     - process that started the server; receives the replies
%%   peer       - the local peer
%%   history_id - history id passed to start_link/3
%%   term       - term passed to start_link/3
%%   pids       - map of worker pid => {Peer, Opaque} for running catchups
%%   pending    - queue of {Peer, PeerSeqno, Opaque} waiting for a slot
-record(state, { parent,
                 peer,
                 history_id,
                 term,
                 pids,
                 pending }).

%% Starts the catchup server. The calling process is remembered as the
%% parent that catchup results are delivered to.
start_link(Self, HistoryId, Term) ->
    InitArgs = [self(), Self, HistoryId, Term],
    gen_server:start_link(?START_NAME(?MODULE), ?MODULE, InitArgs, []).

%% Asynchronously requests a catchup of Peer starting from PeerSeqno.
%% Opaque tags the eventual reply sent to the parent.
catchup_peer(Pid, Opaque, Peer, PeerSeqno) ->
    Request = {catchup_peer, Opaque, Peer, PeerSeqno},
    gen_server:cast(Pid, Request).

%% Asynchronously cancels pending and running catchups for Peer.
cancel_catchup(Pid, Peer) ->
    Request = {cancel_catchup, Peer},
    gen_server:cast(Pid, Request).

%% Stops the server and waits for the process to go away.
stop(Pid) ->
    ok = gen_server:call(Pid, stop, 10000),
    ok = chronicle_utils:wait_for_process(Pid, 1000).

%% callbacks

%% Builds the initial state: no running workers, nothing pending.
init([Parent, Self, HistoryId, Term]) ->
    InitialState = #state{parent = Parent,
                          peer = Self,
                          history_id = HistoryId,
                          term = Term,
                          pids = #{},
                          pending = queue:new()},
    {ok, InitialState}.
%% `stop' terminates the server normally after replying `ok'; any other
%% call is answered with `nack'.
handle_call(stop, _From, State) ->
    {stop, normal, ok, State};
handle_call(_Unknown, _From, State) ->
    {reply, nack, State}.

%% Dispatches the two known casts; anything else stops the server.
handle_cast({catchup_peer, Opaque, Peer, PeerSeqno}, State) ->
    handle_catchup_peer(Peer, PeerSeqno, Opaque, State);
handle_cast({cancel_catchup, Peer}, State) ->
    handle_cancel_catchup(Peer, State);
handle_cast(Unexpected, State) ->
    {stop, {unexpected_cast, Unexpected}, State}.

%% Results from workers are processed; other messages are logged and
%% dropped.
handle_info({catchup_result, Pid, Result}, State) ->
    handle_catchup_result(Pid, Result, State);
handle_info(Msg, State) ->
    ?WARNING("Unexpected message ~p", [Msg]),
    {noreply, State}.

%% Kills all running workers on termination.
terminate(_Reason, State) ->
    terminate_children(State).

%% internal

%% Queues a catchup request and starts as many pending catchups as the
%% parallelism limit allows. Logs when nothing could be started.
handle_catchup_peer(Peer, PeerSeqno, Opaque,
                    #state{peer = Self, pending = Queue} = State) ->
    %% We should never catchup ourselves
    true = (Peer =/= Self),

    Enqueued = State#state{
                 pending = queue:in({Peer, PeerSeqno, Opaque}, Queue)},
    {Spawned, NewState} = maybe_spawn_pending(Enqueued),

    case Spawned of
        false ->
            ?INFO("Catchup for peer ~w at seqno ~b "
                  "delayed due to other catchups still in progress.",
                  [Peer, PeerSeqno]);
        true ->
            ok
    end,

    {noreply, NewState}.

%% Drops the queued request for Peer first, then kills its running worker,
%% and finally uses any freed slot for other pending requests.
handle_cancel_catchup(Peer, State) ->
    Cancelled = cancel_active(Peer, cancel_pending(Peer, State)),
    {_Spawned, NewState} = maybe_spawn_pending(Cancelled),
    {noreply, NewState}.

%% Forwards a worker's result to the parent, forgets the worker and starts
%% further pending catchups.
handle_catchup_result(Pid, Result, #state{pids = Pids} = State) ->
    {{_Peer, Opaque}, RemainingPids} = maps:take(Pid, Pids),
    reply_to_parent(Opaque, Result, State),
    {_Spawned, NewState} =
        maybe_spawn_pending(State#state{pids = RemainingPids}),
    {noreply, NewState}.

%% Sends `{Opaque, Reply}' to the parent process.
reply_to_parent(Opaque, Reply, #state{parent = Parent}) ->
    Message = {Opaque, Reply},
    Parent ! Message,
    ok.
%% Kills every running catchup worker.
terminate_children(#state{pids = Pids}) ->
    lists:foreach(fun terminate_child/1, maps:keys(Pids)).

%% Kills one worker and discards any result it may already have sent.
terminate_child(Pid) ->
    chronicle_utils:terminate_linked_process(Pid, kill),
    ?FLUSH({catchup_result, Pid, _}).

%% Starts pending catchups while there is spare capacity. Returns
%% `{Spawned, State}' where Spawned tells whether at least one worker was
%% started by this call.
maybe_spawn_pending(State) ->
    maybe_spawn_pending(false, State).

maybe_spawn_pending(Spawned, #state{pids = Pids,
                                    pending = Pending} = State) ->
    HasCapacity = maps:size(Pids) < ?MAX_PARALLEL_CATCHUPS,
    case HasCapacity andalso queue:out(Pending) of
        false ->
            {Spawned, State};
        {empty, _} ->
            {Spawned, State};
        {{value, {Peer, PeerSeqno, Opaque}}, RestPending} ->
            Dequeued = State#state{pending = RestPending},
            Started = spawn_catchup(Peer, PeerSeqno, Opaque, Dequeued),
            maybe_spawn_pending(true, Started)
    end.

%% Spawns a linked worker that runs do_catchup/3 and reports the outcome
%% back to this server as `{catchup_result, WorkerPid, Result}'. Exceptions
%% in the worker are logged and turned into an error result.
spawn_catchup(Peer, PeerSeqno, Opaque, #state{pids = Pids} = State) ->
    ?DEBUG("Starting catchup for peer ~w at seqno ~b", [Peer, PeerSeqno]),

    true = (maps:size(Pids) < ?MAX_PARALLEL_CATCHUPS),
    Server = self(),
    Worker =
        fun () ->
                Outcome =
                    try
                        do_catchup(Peer, PeerSeqno, State)
                    catch
                        Class:Reason:Stacktrace ->
                            ?ERROR("Catchup to peer ~p failed: ~p~n"
                                   "Stacktrace:~n~p",
                                   [Peer, {Class, Reason},
                                    sanitize_stacktrace(Stacktrace)]),
                            {error, {catchup_failed, {Class, Reason}}}
                    end,

                Server ! {catchup_result, self(), Outcome}
        end,
    Pid = proc_lib:spawn_link(Worker),
    State#state{pids = Pids#{Pid => {Peer, Opaque}}}.

%% Brings Peer up to date: installs a full snapshot when one is available
%% and then sends the committed log entries following it; otherwise sends
%% the entries following PeerSeqno.
do_catchup(Peer, PeerSeqno, State) ->
    case get_full_snapshot(PeerSeqno) of
        no_snapshot ->
            catchup_from_log(Peer, PeerSeqno, State);
        Snapshot ->
            catchup_from_snapshot(Peer, Snapshot, State)
    end.

%% No snapshot: continue from the peer's own seqno if it is still in the
%% log.
catchup_from_log(Peer, PeerSeqno, State) ->
    case chronicle_agent:get_term_for_seqno(PeerSeqno) of
        {error, compacted} ->
            %% Have the proposer retry.
            {compacted, PeerSeqno};
        {ok, AtTerm} ->
            send_entries(Peer, AtTerm, PeerSeqno, State)
    end.

%% Snapshot available: install it, then continue from the snapshot
%% position.
catchup_from_snapshot(Peer, Snapshot, State) ->
    case install_snapshot(Peer, Snapshot, State) of
        {error, _} = Error ->
            Error;
        {ok, _} ->
            {SnapshotSeqno, _, SnapshotTerm, _, _} = Snapshot,
            send_entries(Peer, SnapshotTerm, SnapshotSeqno, State)
    end.

%% Fetches a full snapshot from the local agent, repacked as a 5-tuple, or
%% returns `no_snapshot'.
get_full_snapshot(PeerSeqno) ->
    case chronicle_agent:get_full_snapshot(PeerSeqno) of
        {error, no_snapshot} ->
            no_snapshot;
        {ok, Seqno, HistoryId, Term, Config, RSMSnapshots} ->
            {Seqno, HistoryId, Term, Config, RSMSnapshots}
    end.

%% Installs the given snapshot on the remote peer's agent.
install_snapshot(Peer,
                 {Seqno, SnapHistoryId, SnapTerm, SnapConfig, RSMSnapshots},
                 #state{peer = Self, history_id = HistoryId, term = Term}) ->
    Agent = chronicle_agent:server_ref(Peer, Self),
    chronicle_agent:install_snapshot(Agent, HistoryId, Term,
                                     Seqno,
                                     SnapHistoryId, SnapTerm,
                                     SnapConfig, RSMSnapshots).

%% Sends the committed entries that follow AtSeqno to the peer. Returns
%% `{ok, Seqno}' with the seqno the peer has reached, `{compacted, AtSeqno}'
%% if the entries are no longer available, or the error from appending.
send_entries(Peer, AtTerm, AtSeqno, State) ->
    case chronicle_agent:get_log_committed(AtSeqno + 1) of
        {error, compacted} ->
            %% Have the proposer retry.
            {compacted, AtSeqno};
        {ok, _, []} ->
            {ok, AtSeqno};
        {ok, CommittedSeqno, Entries} ->
            append(Peer, CommittedSeqno, AtTerm, AtSeqno, Entries, State)
    end.

%% Appends Entries on the remote peer's agent.
append(Peer, CommittedSeqno, AtTerm, AtSeqno, Entries,
       #state{peer = Self, history_id = HistoryId, term = Term}) ->
    Agent = chronicle_agent:server_ref(Peer, Self),
    AppendResult = chronicle_agent:append(Agent, HistoryId, Term,
                                          CommittedSeqno, AtTerm, AtSeqno,
                                          Entries),
    case AppendResult of
        {error, _} ->
            AppendResult;
        ok ->
            {ok, CommittedSeqno}
    end.
%% Kills the running workers that serve Peer and removes them from the
%% bookkeeping map.
cancel_active(Peer, #state{pids = Pids} = State) ->
    KeepWorker =
        fun (Pid, {ActivePeer, _Opaque}) when ActivePeer =:= Peer ->
                terminate_child(Pid),
                false;
            (_Pid, {_OtherPeer, _Opaque}) ->
                true
        end,
    State#state{pids = maps:filter(KeepWorker, Pids)}.

%% Removes the queued (not yet started) requests for Peer.
cancel_pending(Peer, #state{pending = Pending} = State) ->
    IsOtherPeer = fun ({QueuedPeer, _, _}) -> QueuedPeer =/= Peer end,
    State#state{pending = queue:filter(IsOtherPeer, Pending)}.
--------------------------------------------------------------------------------
/src/chronicle_config.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_config).

-include("chronicle.hrl").

%% Maximum number of entries kept in a config's history log.
-define(MAX_HISTORY_LOG_SIZE, 10).

-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.

-export([format_config/1]).
-export([is_config/1, is_stable/1]).
-export([transition/2, next_config/1]).
-export([init/3, reinit/3, put_rsm/2, branch/3]).
-export([get_request_id/1, set_request_id/2]).
-export([get_compat_version/1, set_compat_version/2]).
-export([set_lock/2, check_lock/2]).
-export([get_rsms/1, get_quorum/1]).
-export([get_peers/1, get_replicas/1, get_voters/1]).
-export([add_peers/2, remove_peers/2, set_peer_roles/2]).
-export([get_peer_ids/1]).
-export([get_peer_id/2, is_peer/3]).
-export([get_branch_opaque/1]).
-export([get_settings/1, set_settings/2]).
-export([is_compatible_revision/3]).

-export_type([peers/0]).

-type peers() :: #{chronicle:peer() =>
                       #{id := chronicle:peer_id(), role := chronicle:role()}}.

%% Renders a config as a list of {Label, Value} pairs for display.
format_config(#config{} = Config) ->
    [{"Request id", Config#config.request_id},
     {"Compat version", Config#config.compat_version},
     {"Lock", Config#config.lock},
     {"Peers", format_peers(Config#config.peers)},
     {"Old peers", format_peers(Config#config.old_peers)},
     {"State machines",
      format_state_machines(Config#config.state_machines)},
     {"Branch", format_branch(Config#config.branch)},
     {"History log", Config#config.history_log},
     {"Settings", Config#config.settings}].

%% Renders a peers map; `undefined' (no old peers) is passed through.
format_peers(undefined) ->
    undefined;
format_peers(Peers) ->
    [format_peer(Peer, Info) || {Peer, Info} <- maps:to_list(Peers)].

format_peer(Peer, #{id := PeerId, role := Role}) ->
    {Peer, [{"Id", PeerId}, {"Role", Role}]}.

%% Renders the state machines map, one entry per machine.
format_state_machines(StateMachines) ->
    [format_state_machine(Name, RSMConfig) ||
        {Name, RSMConfig} <- maps:to_list(StateMachines)].

format_state_machine(Name, #rsm_config{module = Module, args = Args}) ->
    {Name, [{"Module", Module},
            {"Args", chronicle_dump:raw(Args)}]}.

%% Renders a branch record; `undefined' (no branch) is passed through.
format_branch(undefined) ->
    undefined;
format_branch(#branch{} = Branch) ->
    [{"History id", Branch#branch.history_id},
     {"Previous history id", Branch#branch.old_history_id},
     {"Coordinator", Branch#branch.coordinator},
     {"Peers", Branch#branch.peers},
     {"Opaque", Branch#branch.opaque}].
%% True iff the value is a #config{} record.
is_config(#config{}) ->
    true;
is_config(_Other) ->
    false.

%% A config is stable when it carries no old peers, i.e. it is not a
%% transitional config.
is_stable(#config{old_peers = undefined}) ->
    true;
is_stable(#config{}) ->
    false.

%% Returns the config that follows a transitional config: the same config
%% with the old peers dropped. A stable config is returned as is.
next_config(#config{old_peers = undefined} = Config) ->
    Config;
next_config(#config{} = Config) ->
    Config#config{old_peers = undefined}.

%% Turns NewConfig into a transitional config (remembering the peers of
%% OldConfig) when the voter change requires it; otherwise returns
%% NewConfig unchanged.
transition(NewConfig, OldConfig) ->
    case needs_transition(NewConfig, OldConfig) of
        false ->
            NewConfig;
        true ->
            NewConfig#config{old_peers = OldConfig#config.peers}
    end.

%% Builds the initial config: Peer is the only member (a voter with a
%% fresh id) and chronicle_config_rsm is always among the state machines.
init(HistoryId, Peer, Machines0) ->
    ConfigRSM = {chronicle_config_rsm, chronicle_config_rsm, []},
    MachinesMap =
        maps:from_list(
          [{Name, #rsm_config{module = Module, args = Args}} ||
              {Name, Module, Args} <- [ConfigRSM | Machines0]]),
    #config{compat_version = ?COMPAT_VERSION,
            peers = #{Peer => peer_info(voter)},
            state_machines = MachinesMap,
            history_log = add_history(HistoryId, ?NO_SEQNO, [])}.

%% Replaces the membership of a stable config with the single voter
%% NewPeer, which inherits the peer id OldPeer had.
reinit(NewPeer, OldPeer, #config{old_peers = undefined} = Config) ->
    {ok, PeerId} = get_peer_id(OldPeer, Config),
    SinglePeer = #{NewPeer => peer_info(PeerId, voter)},
    reset(Config#config{peers = SinglePeer}).

%% Adds or replaces the state machine Name.
put_rsm({Name, Module, Args}, #config{state_machines = MachinesMap} = Config) ->
    RSMConfig = #rsm_config{module = Module, args = Args},
    Updated = Config#config{
                state_machines = maps:put(Name, RSMConfig, MachinesMap)},
    {ok, reset(Updated)}.
%% Produces the config for a branch: the branch's history id is recorded
%% in the history log at Seqno and the membership becomes exactly the
%% branch peers, all as voters keeping their existing peer ids.
branch(Seqno, #branch{peers = Peers,
                      history_id = HistoryId} = Branch, Config0) ->
    Config = reset(Config0),
    NewHistoryLog = add_history(HistoryId, Seqno, Config#config.history_log),
    %% TODO: figure out what to do with replicas
    NewPeers =
        maps:from_list(
          [{Peer, peer_info(Peer, Config, voter)} || Peer <- Peers]),
    Config#config{peers = NewPeers,
                  old_peers = undefined,
                  branch = Branch,
                  history_log = NewHistoryLog}.

%% Prepends {HistoryId, Seqno} to the history log. For a non-empty log the
%% seqno must be greater than that of the newest entry, and the result is
%% truncated to ?MAX_HISTORY_LOG_SIZE entries.
add_history(HistoryId, Seqno, []) ->
    [{HistoryId, Seqno}];
add_history(HistoryId, Seqno, [{_, NewestSeqno} | _] = HistoryLog) ->
    true = Seqno > NewestSeqno,
    lists:sublist([{HistoryId, Seqno} | HistoryLog], ?MAX_HISTORY_LOG_SIZE).

get_compat_version(#config{compat_version = Version}) ->
    Version.

set_compat_version(Version, Config) ->
    Updated = Config#config{compat_version = Version},
    reset(Updated).

set_lock(Lock, #config{} = Config) ->
    Updated = Config#config{lock = Lock},
    reset(Updated).

%% Succeeds when the caller does not require a lock (`unlocked') or holds
%% the lock currently stored in the config.
check_lock(unlocked, #config{}) ->
    ok;
check_lock(Lock, #config{lock = Lock}) ->
    ok;
check_lock(LockReq, #config{lock = Lock}) ->
    {error, {lock_revoked, LockReq, Lock}}.

get_request_id(#config{request_id = RequestId}) ->
    RequestId.

set_request_id(RequestId, #config{} = Config) ->
    Config#config{request_id = RequestId}.

reset_request_id(Config) ->
    set_request_id(undefined, Config).

get_rsms(#config{state_machines = RSMs}) ->
    RSMs.

%% Quorum of the config: a majority of the voters, or for a transitional
%% config a joint quorum of the new and the old voters.
get_quorum(#config{peers = Peers, old_peers = undefined}) ->
    peers_quorum(Peers);
get_quorum(#config{peers = Peers, old_peers = OldPeers}) ->
    {joint, peers_quorum(Peers), peers_quorum(OldPeers)}.
%% Sorted list of all peers, including the old peers of a transitional
%% config (without duplicates).
get_peers(#config{peers = Peers, old_peers = undefined}) ->
    lists:sort(maps:keys(Peers));
get_peers(#config{peers = Peers, old_peers = OldPeers}) ->
    lists:usort(maps:keys(Peers) ++ maps:keys(OldPeers)).

%% Sorted, duplicate-free list of the peer ids in the config, including
%% the old peers of a transitional config.
get_peer_ids(#config{peers = Peers, old_peers = undefined}) ->
    get_peer_ids(#{}, Peers);
get_peer_ids(#config{peers = Peers, old_peers = OldPeers}) ->
    get_peer_ids(OldPeers, Peers).

get_peer_ids(OldPeers, Peers) ->
    %% On a peer present in both maps the entry from Peers wins.
    Infos = maps:values(maps:merge(OldPeers, Peers)),
    lists:usort(lists:map(fun (#{id := Id}) -> Id end, Infos)).

get_replicas(#config{peers = Peers}) ->
    lists:sort(peers_replicas(Peers)).

get_voters(#config{peers = Peers}) ->
    lists:sort(peers_voters(Peers)).

%% Adds the given {Peer, Role} pairs; fails if any peer is already a
%% member.
add_peers(Peers, Config) ->
    AddAll = fun (CurrentPeers) -> add_peers_loop(Peers, CurrentPeers) end,
    update_peers(AddAll, Config).

add_peers_loop([{Peer, Role} | Rest], AccPeers) ->
    case maps:find(Peer, AccPeers) of
        error ->
            add_peers_loop(Rest, AccPeers#{Peer => peer_info(Role)});
        {ok, #{role := CurrentRole}} ->
            {error, {already_member, Peer, CurrentRole}}
    end;
add_peers_loop([], AccPeers) ->
    {ok, AccPeers}.

%% Removes the given peers; peers that are not members are ignored.
remove_peers(Peers, Config) ->
    RemoveAll = fun (CurrentPeers) ->
                        {ok, maps:without(Peers, CurrentPeers)}
                end,
    update_peers(RemoveAll, Config).

%% Changes the roles of the given {Peer, Role} pairs; fails if any peer is
%% not a member.
set_peer_roles(Peers, Config) ->
    SetAll = fun (CurrentPeers) ->
                     set_peer_roles_loop(Peers, CurrentPeers)
             end,
    update_peers(SetAll, Config).

set_peer_roles_loop([{Peer, Role} | Rest], AccPeers) ->
    case maps:find(Peer, AccPeers) of
        error ->
            {error, {not_member, Peer}};
        {ok, Info} ->
            set_peer_roles_loop(Rest, AccPeers#{Peer => Info#{role => Role}})
    end;
set_peer_roles_loop([], AccPeers) ->
    {ok, AccPeers}.
%% Applies Fun to the current peers map and stores the result, refusing
%% any change that would leave the config without voters.
update_peers(Fun, #config{peers = OldPeers} = Config) ->
    case Fun(OldPeers) of
        {error, _} = Error ->
            Error;
        {ok, NewPeers} ->
            case peers_voters(NewPeers) of
                [] ->
                    {error, no_voters_left};
                [_ | _] ->
                    {ok, reset(Config#config{peers = NewPeers})}
            end
    end.

%% Peers from the map that have the given role.
peers_of_role(Role, Peers) ->
    HasRole = fun (_Peer, #{role := PeerRole}) -> PeerRole =:= Role end,
    maps:keys(maps:filter(HasRole, Peers)).

peers_voters(Peers) ->
    peers_of_role(voter, Peers).

peers_replicas(Peers) ->
    peers_of_role(replica, Peers).

%% Majority quorum over the voters of the peers map.
peers_quorum(Peers) ->
    {majority, sets:from_list(peers_voters(Peers))}.

needs_transition(NewConfig, OldConfig) ->
    do_needs_transition(get_voters(NewConfig), get_voters(OldConfig)).

do_needs_transition(NewVoters, OldVoters) ->
    NumAdded = length(NewVoters -- OldVoters),
    NumRemoved = length(OldVoters -- NewVoters),

    %% If there's no more than one change, then all quorums in the new config
    %% intersect all quorums in the old config. So we don't need to go
    %% through a transitional configuration.
    NumAdded + NumRemoved > 1.

-ifdef(TEST).
needs_transition_test() ->
    ?assertEqual(false, do_needs_transition([a, b, c], [a, b, c, d])),
    ?assertEqual(false, do_needs_transition([a, b, c], [a, b])),
    ?assertEqual(false, do_needs_transition([a, b, c], [c, a, d, b])),
    ?assertEqual(true, do_needs_transition([a, b, c], [a, b, c, d, e])),
    ?assertEqual(true, do_needs_transition([a, b, c], [a, b, d])),
    ?assertEqual(true, do_needs_transition([a, b, c], [c, a, e, d, b])).
-endif.

%% Peer info for a brand new member: a freshly generated id plus the role.
peer_info(Role) ->
    NewId = chronicle_utils:random_uuid(),
    peer_info(NewId, Role).
%% Peer info for an existing member of Config, keeping its peer id. Exits
%% if the peer is unknown to the config.
peer_info(Peer, Config, Role) ->
    case get_peer_id(Peer, Config) of
        not_peer ->
            exit({unknown_peer, Peer, Config});
        {ok, PeerId} ->
            peer_info(PeerId, Role)
    end.

peer_info(Id, Role) ->
    #{id => Id, role => Role}.

%% Looks the peer up among the current peers first and, for a transitional
%% config, among the old peers. Returns `{ok, PeerId}' or `not_peer'.
get_peer_id(Peer, #config{peers = Peers, old_peers = OldPeers}) ->
    case find_peer_info(Peer, Peers, OldPeers) of
        {ok, #{id := PeerId}} ->
            {ok, PeerId};
        error ->
            not_peer
    end.

find_peer_info(Peer, Peers, OldPeers) ->
    case maps:find(Peer, Peers) of
        {ok, _} = Found ->
            Found;
        error when OldPeers =:= undefined ->
            error;
        error ->
            maps:find(Peer, OldPeers)
    end.

%% True iff Peer is known to the config under exactly the id PeerId.
is_peer(Peer, PeerId, Config) ->
    get_peer_id(Peer, Config) =:= {ok, PeerId}.

get_branch_opaque(#config{branch = undefined}) ->
    no_branch;
get_branch_opaque(#config{branch = #branch{opaque = Opaque}}) ->
    {ok, Opaque}.

get_settings(#config{settings = Settings}) ->
    Settings.

set_settings(NewSettings, Config) ->
    Updated = Config#config{settings = NewSettings},
    reset(Updated).

reset_branch(#config{branch = undefined} = Config) ->
    Config;
reset_branch(#config{} = Config) ->
    Config#config{branch = undefined}.

%% Clears the per-change fields (branch and request id) of a config.
reset(Config) ->
    WithoutBranch = reset_branch(Config),
    reset_request_id(WithoutBranch).

%% Checks the revision against the config's history log. Returns `true',
%% or `{false, Details}' carrying the inputs of the failed check.
is_compatible_revision({RevHistoryId, RevSeqno} = Revision, HighSeqno,
                       #config{history_log = HistoryLog}) ->
    is_compatible_revision(RevHistoryId, RevSeqno, HighSeqno, HistoryLog)
        orelse {false, {Revision, HighSeqno, HistoryLog}}.
%% A revision ahead of HighSeqno is compatible when it belongs to the
%% current history, or to a history id that does not appear among the
%% older entries of the log. A revision at or below HighSeqno must carry
%% the history id of the newest log entry whose seqno does not exceed the
%% revision's seqno.
is_compatible_revision(RevHistoryId, RevSeqno, HighSeqno, HistoryLog)
  when RevSeqno > HighSeqno ->
    [{CurrentHistoryId, _} | SealedLog] = HistoryLog,
    %% Matching the current history id is the most common case.
    %%
    %% If RevHistoryId is one of "sealed" history ids, then
    %% the revision is incompatible.
    CurrentHistoryId =:= RevHistoryId
        orelse not lists:keymember(RevHistoryId, 1, SealedLog);
is_compatible_revision(RevHistoryId, RevSeqno, _HighSeqno, HistoryLog) ->
    IsNewer = fun ({_, LogSeqno}) -> LogSeqno > RevSeqno end,
    case lists:dropwhile(IsNewer, HistoryLog) of
        [{LogHistoryId, _} | _] ->
            LogHistoryId =:= RevHistoryId;
        [] ->
            false
    end.

-ifdef(TEST).
is_compatible_revision_test() ->
    HistoryLog = [{b, 15}, {a, 10}],
    HighSeqno = 20,

    true = is_compatible_revision(b, 25, HighSeqno, HistoryLog),
    true = is_compatible_revision(c, 25, HighSeqno, HistoryLog),
    false = is_compatible_revision(a, 25, HighSeqno, HistoryLog),
    true = is_compatible_revision(b, 17, HighSeqno, HistoryLog),
    true = is_compatible_revision(b, 15, HighSeqno, HistoryLog),
    false = is_compatible_revision(b, 14, HighSeqno, HistoryLog),
    false = is_compatible_revision(a, 15, HighSeqno, HistoryLog),
    true = is_compatible_revision(a, 13, HighSeqno, HistoryLog),
    true = is_compatible_revision(a, 10, HighSeqno, HistoryLog),
    false = is_compatible_revision(a, 9, HighSeqno, HistoryLog),
    false = is_compatible_revision(b, 9, HighSeqno, HistoryLog),
    false = is_compatible_revision(c, 9, HighSeqno, HistoryLog).
-endif.
--------------------------------------------------------------------------------
/src/chronicle_config_rsm.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_config_rsm).

-include("chronicle.hrl").

-export([get_config/1, get_cluster_info/1, get_peers/1, check_quorum/1]).
-export([acquire_lock/1, remove_peers/3, add_peers/3, set_peer_roles/3,
         set_settings/2, replace_settings/2, unset_settings/2,
         set_compat_version/3, put_rsm/2]).

-export([format_state/1]).
-export([specs/2, init/2, post_init/3, state_enter/4,
         handle_config/5, apply_snapshot/5, handle_query/4,
         apply_command/6, handle_info/4, terminate/4]).

%% The state machine is registered under the module's own name.
-define(NAME, ?MODULE).

%% Replicated state:
%%   current_request  - `undefined' or {Id, Request, Reply, NewConfig} for
%%                      the request whose config change is in flight
%%   pending_requests - list of {Id, Request} waiting for their turn
%%   config           - the latest config log entry
-record(state, { current_request,
                 pending_requests,

                 config }).

%% Local, non-replicated data: whether this node currently is the leader.
-record(data, { is_leader = false }).

get_config(Timeout) ->
    query(get_config, Timeout).

get_cluster_info(Timeout) ->
    query(get_cluster_info, Timeout).

get_peers(Timeout) ->
    query(get_peers, Timeout).

%% Syncs the state machine and then runs the query, both under the same
%% timeout.
query(Query, Timeout) ->
    TimeoutRef = chronicle_utils:start_timeout(Timeout),
    ok = chronicle_rsm:sync(?NAME, TimeoutRef),
    chronicle_rsm:query(?NAME, Query, TimeoutRef).

%% Returns `true' if a sync completes within Timeout and `{false, timeout}'
%% if it exits with `timeout'.
check_quorum(Timeout) ->
    try chronicle_rsm:sync(?NAME, Timeout) of
        ok ->
            true
    catch
        exit:timeout ->
            {false, timeout}
    end.

acquire_lock(Timeout) ->
    command(acquire_lock, Timeout).
%% Command wrappers. Each one submits a tagged command to the state
%% machine and returns its reply (see command/2).
remove_peers(Lock, Peers, Timeout) ->
    Command = {remove_peers, Lock, Peers},
    command(Command, Timeout).

add_peers(Lock, Peers, Timeout) ->
    Command = {add_peers, Lock, Peers},
    command(Command, Timeout).

set_peer_roles(Lock, Peers, Timeout) ->
    Command = {set_peer_roles, Lock, Peers},
    command(Command, Timeout).

set_settings(Settings, Timeout) ->
    Command = {set_settings, Settings},
    command(Command, Timeout).

replace_settings(Settings, Timeout) ->
    Command = {replace_settings, Settings},
    command(Command, Timeout).

unset_settings(Names, Timeout) ->
    Command = {unset_settings, Names},
    command(Command, Timeout).

set_compat_version(Lock, NewVersion, Timeout) ->
    Command = {set_compat_version, Lock, NewVersion},
    command(Command, Timeout).

put_rsm(Machine, Timeout) ->
    Command = {put_rsm, Machine},
    command(Command, Timeout).

%% callbacks

%% Renders the state as a list of {Label, Value} pairs for display.
format_state(#state{current_request = CurrentRequest,
                    pending_requests = PendingRequests,
                    config = ConfigEntry}) ->
    #log_entry{history_id = HistoryId,
               term = Term,
               seqno = Seqno,
               value = Config} = ConfigEntry,

    FormattedEntry = [{"History id", HistoryId},
                      {"Term", Term},
                      {"Seqno", Seqno},
                      {"Value", chronicle_config:format_config(Config)}],

    [{"Current request", CurrentRequest},
     {"Pending requests", PendingRequests},
     {"Config", FormattedEntry}].

specs(_Name, _Args) ->
    [].

%% Starts with no request in flight and nothing pending.
init(?NAME, []) ->
    InitialState = #state{current_request = undefined,
                          pending_requests = []},
    {ok, InitialState, #data{}}.

post_init(_, _, Data) ->
    {ok, Data}.

%% Tracks leadership in the local data. On becoming leader the current
%% request, if any, is handed to maybe_submit_current_request/2.
state_enter(leader, _Revision, State, Data) ->
    LeaderData = Data#data{is_leader = true},
    maybe_submit_current_request(State, LeaderData),
    {ok, LeaderData};
state_enter(_Other, _Revision, _State, Data) ->
    {ok, Data#data{is_leader = false}}.
%% Called with a new config log entry. The entry becomes the current config
%% and the in-flight request slot is cleared. Then:
%%   - with no request in flight, nothing else happens;
%%   - if the config carries the request id of the in-flight request, that
%%     request's reply is emitted and the pending requests are processed;
%%   - otherwise the in-flight request is put back at the head of the
%%     pending list and the pending requests are re-evaluated against the
%%     new config.
%% Returns {ok, NewState, Data, Replies} with Replies a list of {Id, Reply}.
handle_config(ConfigEntry, _Revision, _StateRevision,
              #state{current_request = CurrentRequest,
                     pending_requests = PendingRequests} = State, Data) ->
    #log_entry{value = Config} = ConfigEntry,
    ConfigRequestId = chronicle_config:get_request_id(Config),
    NewState0 = State#state{current_request = undefined,
                            config = ConfigEntry},

    case CurrentRequest of
        undefined ->
            {ok, NewState0, Data, []};
        {Id, _, Reply, _} when Id =:= ConfigRequestId ->
            loop_pending_requests([{Id, Reply}], NewState0, Data);
        {Id, Request, _, _} ->
            NewState1 = NewState0#state{
                          pending_requests = [{Id, Request} | PendingRequests]},
            loop_pending_requests([], NewState1, Data)
    end.

%% Walks the pending requests against the current config. Rejected requests
%% are answered right away; the first accepted request becomes the new
%% in-flight request and its config is passed to maybe_submit_config/4.
%% Requests after the accepted one stay pending.
loop_pending_requests(Replies,
                      #state{pending_requests = Requests,
                             config = ConfigEntry} = State, Data) ->
    #log_entry{value = Config} = ConfigEntry,
    {SubmitRequest, FinalReplies, NewPendingRequests} =
        do_loop_pending_requests(Replies, Requests, Config),

    NewState = State#state{pending_requests = NewPendingRequests},
    FinalState =
        case SubmitRequest of
            no_request ->
                NewState;
            {Id, _Request, _Reply, NewConfig} ->
                maybe_submit_config(Id, NewConfig, NewState, Data),
                NewState#state{current_request = SubmitRequest}
        end,

    {ok, FinalState, Data, FinalReplies}.

%% Returns {RequestToSubmit | no_request, Replies, RemainingRequests}.
%% Replies for rejected requests are prepended to the accumulator, so the
%% resulting list is not in request order.
do_loop_pending_requests(Replies, [], _) ->
    {no_request, Replies, []};
do_loop_pending_requests(Replies, [{Id, Request} | Requests], Config) ->
    case do_apply_command(Request, Config) of
        {accept, Reply, NewConfig} ->
            {{Id, Request, Reply, NewConfig}, Replies, Requests};
        {reject, Reply} ->
            NewReplies = [{Id, Reply} | Replies],
            do_loop_pending_requests(NewReplies, Requests, Config)
    end.

%% After a snapshot is applied, submit the in-flight request again (only has
%% an effect when this node is the leader).
apply_snapshot(_Revision, State, _OldRevision, _OldState, Data) ->
    _ = maybe_submit_current_request(State, Data),
    {ok, Data}.

%% Read-only queries against the current config entry.
handle_query(get_config, _, #state{config = Entry}, Data) ->
    Reply = {ok,
             Entry#log_entry.value,
             chronicle_utils:log_entry_revision(Entry)},
    {reply, Reply, Data};
handle_query(get_peers, _, #state{config = Entry}, Data) ->
    Config = Entry#log_entry.value,
    VoterList = chronicle_config:get_voters(Config),
    ReplicaList = chronicle_config:get_replicas(Config),
    {reply, #{voters => VoterList, replicas => ReplicaList}, Data};
handle_query(get_cluster_info, {HistoryId, Seqno}, State, Data) ->
    #state{config = Entry} = State,
    #log_entry{value = Config} = Entry,
    CompatVersion = chronicle_config:get_compat_version(Config),
    {reply,
     #{history_id => HistoryId,
       compat_version => CompatVersion,
       committed_seqno => Seqno,
       config => Entry},
     Data};
handle_query(_, _, _, Data) ->
    {reply, {error, unknown_query}, Data}.

%% Only one request is in flight at a time. With nothing in flight the
%% command is evaluated immediately: a rejection is answered right away, an
%% accepted command becomes the in-flight request. Otherwise the command is
%% put at the front of the pending list.
apply_command(Id, Request, _, _,
              #state{current_request = undefined,
                     pending_requests = _,
                     config = Entry} = State, Data) ->
    case do_apply_command(Request, Entry#log_entry.value) of
        {reject, Reply} ->
            {reply, Reply, State, Data};
        {accept, Reply, NewConfig} ->
            InFlight = {Id, Request, Reply, NewConfig},
            NewState = State#state{current_request = InFlight},
            _ = maybe_submit_config(Id, NewConfig, State, Data),
            {noreply, NewState, Data}
    end;
apply_command(Id, Request, _, _,
              #state{current_request = _,
                     pending_requests = Pending,
                     config = _} = State, Data) ->
    Queued = State#state{pending_requests = [{Id, Request} | Pending]},
    {noreply, Queued, Data}.

%% Evaluate a command against Config. Returns {accept, Reply, NewConfig} or
%% {reject, Reply}. Everything except acquire_lock goes through update/2,
%% which turns handler results and throws into that shape.
do_apply_command(acquire_lock, Config) ->
    NewLock = chronicle_utils:random_uuid(),
    {accept, {ok, NewLock}, chronicle_config:set_lock(NewLock, Config)};
do_apply_command({remove_peers, Lock, Peers}, Config) ->
    update(fun handle_remove_peers/3, [Lock, Peers, Config]);
do_apply_command({add_peers, Lock, Peers}, Config) ->
    update(fun handle_add_peers/3, [Lock, Peers, Config]);
do_apply_command({set_peer_roles, Lock, Peers}, Config) ->
    update(fun handle_set_peer_roles/3, [Lock, Peers, Config]);
do_apply_command({set_settings, Settings}, Config) ->
    update(fun handle_set_settings/2, [Settings, Config]);
do_apply_command({replace_settings, Settings}, Config) ->
    update(fun handle_replace_settings/2, [Settings, Config]);
do_apply_command({unset_settings, Names}, Config) ->
    update(fun handle_unset_settings/2, [Names, Config]);
do_apply_command({set_compat_version, Lock, NewVersion}, Config) ->
    update(fun handle_set_compat_version/3, [Lock, NewVersion, Config]);
do_apply_command({put_rsm, Machine}, Config) ->
    update(fun handle_put_rsm/2, [Machine, Config]);
do_apply_command(_Unknown, _Config) ->
    {reject, {error, unknown_command}}.

%% No messages are expected; log and carry on.
handle_info(Msg, _StateRevision, _State, Data) ->
    ?WARNING("Unexpected message: ~p", [Msg]),
    {noreply, Data}.

terminate(_Reason, _StateRevision, _State, _Data) ->
    ok.

%% internal

%% Send Command to the config RSM. A {raise, Reason} result is turned into
%% error(Reason) in the calling process; anything else is returned as is.
command(Command, Timeout) ->
    Result = chronicle_rsm:command(?NAME, Command, Timeout),
    case Result of
        {raise, Reason} ->
            error(Reason);
        _ ->
            Result
    end.

%% Submit the config only when #data{} says this node is the leader.
maybe_submit_config(Id, NewConfig, State, Data) ->
    case Data#data.is_leader of
        false ->
            ok;
        true ->
            submit_config(Id, NewConfig, State)
    end.

%% Tag NewConfig with the request id and submit it through
%% chronicle_server:cas_config/2 against the revision of the current config
%% entry.
submit_config(Id, NewConfig, #state{config = CurrentEntry}) ->
    BaseRevision = chronicle_utils:log_entry_revision(CurrentEntry),
    Tagged = chronicle_config:set_request_id(Id, NewConfig),
    chronicle_server:cas_config(Tagged, BaseRevision).

%% Submit the in-flight request again, if there is one.
maybe_submit_current_request(State, Data) ->
    case State#state.current_request of
        {Id, _Request, _Reply, NewConfig} ->
            maybe_submit_config(Id, NewConfig, State, Data);
        undefined ->
            ok
    end.

%% The handlers below validate their arguments (raising through the check_*
%% helpers) and then delegate to chronicle_config.

handle_remove_peers(Lock, Peers, Config) ->
    check_lock(Lock, Config),
    check_peers(Peers),
    chronicle_config:remove_peers(Peers, Config).

handle_add_peers(Lock, PeerRoles, Config) ->
    check_lock(Lock, Config),
    check_peers_and_roles(PeerRoles),
    chronicle_config:add_peers(PeerRoles, Config).

handle_set_peer_roles(Lock, PeerRoles, Config) ->
    check_lock(Lock, Config),
    check_peers_and_roles(PeerRoles),
    chronicle_config:set_peer_roles(PeerRoles, Config).

%% Merge Settings over the existing settings.
handle_set_settings(Settings, Config) ->
    check_settings(Settings),
    Merge = fun (Existing) -> maps:merge(Existing, Settings) end,
    update_settings(Merge, Config).

%% Discard the existing settings in favour of Settings.
handle_replace_settings(Settings, Config) ->
    check_settings(Settings),
    Replace = fun (_Existing) -> Settings end,
    update_settings(Replace, Config).

%% Drop the named settings.
handle_unset_settings(Names, Config) ->
    check_settings_names(Names),
    Remove = fun (Existing) -> maps:without(Names, Existing) end,
    update_settings(Remove, Config).

handle_put_rsm(Machine, Config) ->
    chronicle_config:put_rsm(Machine, Config).

%% Change the compat version. The reply carries both the previous and the
%% new version.
handle_set_compat_version(Lock, NewVersion, Config) ->
    OldVersion = chronicle_config:get_compat_version(Config),

    check_lock(Lock, Config),
    check_compat_version(OldVersion, NewVersion),

    Updated = chronicle_config:set_compat_version(NewVersion, Config),
    {ok, {ok, OldVersion, NewVersion}, Updated}.

%% Raise unless NewVersion is at least ?COMPAT_VERSION and at least
%% OldVersion.
%% NOTE(review): the first comparison rejects versions *below*
%% ?COMPAT_VERSION while the error is named unsupported_compat_version --
%% confirm the direction of the comparison against chronicle.hrl.
check_compat_version(OldVersion, NewVersion) ->
    NewVersion >= ?COMPAT_VERSION orelse
        raise({unsupported_compat_version, NewVersion, ?COMPAT_VERSION}),

    NewVersion >= OldVersion orelse
        raise({bad_compat_version, OldVersion, NewVersion}).

%% Peers must be a list of atoms.
check_peers(Peers) ->
    is_list(Peers) orelse raise(bad_peers),
    lists:foreach(
      fun (Peer) ->
              is_atom(Peer) orelse raise(bad_peers)
      end, Peers).

%% PeerRoles must be a list of {Peer, Role} pairs with atom peers, no peer
%% listed twice, and every role either replica or voter.
check_peers_and_roles(PeerRoles) ->
    is_list(PeerRoles) orelse raise(bad_peers),

    Peers = [Peer || {Peer, _} <- PeerRoles],
    check_peers(Peers),

    NoDuplicates = lists:usort(Peers) =:= lists:sort(Peers),
    %% Elements that are not pairs are skipped by the comprehension above,
    %% so a length mismatch means a malformed element.
    AllPairs = length(Peers) =:= length(PeerRoles),
    (NoDuplicates andalso AllPairs) orelse raise(bad_peers),

    lists:foreach(
      fun ({_Peer, Role}) ->
              lists:member(Role, [replica, voter]) orelse raise(bad_peers)
      end, PeerRoles).

%% 'unlocked' always passes. A binary lock is checked against the config and
%% a mismatch is thrown as a plain error. Anything else is a bad argument.
check_lock(unlocked, _Config) ->
    ok;
check_lock(Lock, Config) when is_binary(Lock) ->
    case chronicle_config:check_lock(Lock, Config) of
        ok ->
            ok;
        {error, Error} ->
            throw(Error)
    end;
check_lock(_Lock, _Config) ->
    raise(bad_lock).

check_settings(Settings) ->
    is_map(Settings) orelse raise(bad_settings).

check_settings_names(Names) ->
    is_list(Names) orelse raise(bad_settings_names).

%% Run a handler and translate its outcome:
%%   {ok, NewConfig}        -> {accept, ok, NewConfig}
%%   {ok, Reply, NewConfig} -> {accept, Reply, NewConfig}
%%   {error, _}             -> {reject, {error, _}}
%% A thrown {raise, _} is rejected as is; any other thrown term is rejected
%% wrapped in {error, _}.
update(Fun, Args) ->
    try apply(Fun, Args) of
        {ok, NewConfig} ->
            {accept, ok, NewConfig};
        {ok, Reply, NewConfig} ->
            {accept, Reply, NewConfig};
        {error, _} = Failure ->
            {reject, Failure}
    catch
        throw:{raise, _} = Raised ->
            {reject, Raised};
        throw:Thrown ->
            {reject, {error, Thrown}}
    end.

%% Abort the current handler; update/2 passes the {raise, Reason} through.
-spec raise(any()) -> no_return().
raise(Reason) ->
    throw({raise, Reason}).

%% Apply Fun to the current settings map and store the result.
update_settings(Fun, Config) ->
    Current = chronicle_config:get_settings(Config),
    {ok, chronicle_config:set_settings(Fun(Current), Config)}.

--------------------------------------------------------------------------------
/src/chronicle_env.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_env).

-include("chronicle.hrl").

-export([data_dir/0]).
-export([setup/0]).

%% For use by chronicle_dump.
-export([setup_logger/0]).
-export([setup_decrypt_function/1]).

-ifdef(TEST).
-export([set_env/2]).
-endif.

%% Return the configured data_dir; exits with no_data_dir when it is unset.
data_dir() ->
    case get_env(data_dir) of
        undefined ->
            exit(no_data_dir);
        {ok, Dir} ->
            Dir
    end.

%% Run the environment setup steps through ?CHECK.
%% NOTE(review): ?CHECK is defined outside this chunk; presumably it stops
%% at the first step that does not return ok -- confirm in chronicle.hrl.
setup() ->
    ?CHECK(check_data_dir(),
           setup_logger_filter(),
           setup_logger(),
           setup_stats(),
           setup_encryption()
          ).

%% Turn the exit raised by data_dir/0 into an error tuple.
check_data_dir() ->
    try data_dir() of
        _Configured ->
            ok
    catch
        exit:no_data_dir ->
            {error, {missing_parameter, data_dir}}
    end.

%% Resolve the {Mod, Fun} stored under Name into a fun of the given arity.
%% Falls back to Default when the parameter is unset.
get_function(Name, Arity, Default) ->
    case get_env(Name) of
        undefined ->
            {ok, Default};
        {ok, ModFun} ->
            case validate_function(ModFun, Arity) of
                false ->
                    {error, {badarg, Name, ModFun}};
                {true, Resolved} ->
                    {ok, Resolved}
            end
    end.

%% {true, Fun} when Mod:Fun/Arity is exported, false for anything else.
validate_function({Mod, Fun}, Arity) ->
    Exported = chronicle_utils:is_function_exported(Mod, Fun, Arity),
    case Exported of
        true ->
            {true, fun Mod:Fun/Arity};
        false ->
            false
    end;
validate_function(_NotModFun, _Arity) ->
    false.

%% Resolve the function for Name and store it in persistent_term under Key.
setup_function(Name, Arity, Default, Key) ->
    case get_function(Name, Arity, Default) of
        {error, _} = Failure ->
            Failure;
        {ok, Resolved} ->
            persistent_term:put(Key, Resolved)
    end.

setup_logger() ->
    setup_function(logger_function, 4, fun logger:log/4, ?CHRONICLE_LOGGER).

%% Install chronicle_logger_filter as a primary logger filter unless the
%% setup_logger_filter parameter is set to false. A filter that is already
%% installed is not an error.
setup_logger_filter() ->
    case get_env(setup_logger_filter, true) of
        false ->
            ok;
        true ->
            {ok, AppModules} = application:get_key(chronicle, modules),
            ModuleSet = maps:from_list([{M, true} || M <- AppModules]),
            FilterSpec = {fun chronicle_logger_filter:filter/2, ModuleSet},
            case logger:add_primary_filter(chronicle_filter, FilterSpec) of
                ok ->
                    ok;
                {error, {already_exist, _}} ->
                    ok;
                {error, _} = Failure ->
                    Failure
            end
    end.

setup_stats() ->
    setup_function(stats_function, 1,
                   fun chronicle_stats:ignore_stats/1, ?CHRONICLE_STATS).

%% Install the encrypt and decrypt hooks. The defaults leave data untouched
%% (the decrypt default wraps it in {ok, _}).
%%
%% The decrypt hook is only installed once the encrypt hook has been set up
%% successfully; a misconfigured encrypt_function is reported to the caller
%% instead of being silently dropped.
setup_encryption() ->
    case setup_function(encrypt_function, 1, fun (Data) -> Data end,
                        ?CHRONICLE_ENCRYPT) of
        ok ->
            setup_function(decrypt_function, 1,
                           fun (Data) -> {ok, Data} end,
                           ?CHRONICLE_DECRYPT);
        {error, _} = Error ->
            Error
    end.

%% Install Fun as the decrypt hook directly, bypassing the application
%% environment.
setup_decrypt_function(Fun) ->
    persistent_term:put(?CHRONICLE_DECRYPT, Fun).

-ifndef(TEST).

get_env(Parameter) ->
    application:get_env(chronicle, Parameter).

-else.

%% In tests, when a process is registered as vnet, parameters are namespaced
%% per peer by prefixing the parameter name with ?PEER() and a dash.
peer_param(Parameter) ->
    case whereis(vnet) of
        undefined ->
            Parameter;
        Pid when is_pid(Pid) ->
            list_to_atom(atom_to_list(?PEER()) ++ "-" ++
                             atom_to_list(Parameter))
    end.

get_env(Parameter) ->
    application:get_env(chronicle, peer_param(Parameter)).

set_env(Parameter, Value) ->
    application:set_env(chronicle, peer_param(Parameter), Value).

-endif.

%% Like get_env/1, but returns the bare value, or Default when unset.
get_env(Parameter, Default) ->
    case get_env(Parameter) of
        {ok, Value} ->
            Value;
        undefined ->
            Default
    end.

--------------------------------------------------------------------------------
/src/chronicle_ets.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_ets).

-behavior(gen_server).

-include("chronicle.hrl").

-export([start_link/0]).
-export([register_writer/1, put/2, get/1]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

-define(SERVER, ?SERVER_NAME(?MODULE)).
-define(TABLE, ?ETS_TABLE(?MODULE)).

start_link() ->
    gen_server:start_link(?START_NAME(?MODULE), ?MODULE, [], []).

%% Register the calling process as the writer of Keys.
register_writer(Keys) ->
    KeySet = sets:from_list(Keys),
    gen_server:call(?SERVER, {register_writer, self(), KeySet}).

%% Write straight into the public table.
put(Key, Value) ->
    %% TODO: I could actually check that whoever's writing is the owner of the
    %% key.
    ets:insert(?TABLE, {Key, Value}).

%% {ok, Value} when Key is present, not_found otherwise.
get(Key) ->
    case ets:lookup(?TABLE, Key) of
        [] ->
            not_found;
        [{Key, Value}] ->
            {ok, Value}
    end.

%% gen_server callbacks

%% The server state is a map of writer pid => {MonitorRef, KeySet}.
init([]) ->
    TableOpts = [public, named_table,
                 {read_concurrency, true},
                 {write_concurrency, true}],
    _ = ets:new(?TABLE, TableOpts),
    {ok, #{}}.

handle_call({register_writer, Pid, Keys}, _From, State) ->
    handle_register_writer(Pid, Keys, State).

%% Casts are not part of the protocol; stop on any.
handle_cast(Cast, State) ->
    {stop, {unexpected_cast, Cast}, State}.

handle_info({'DOWN', _MRef, process, Pid, _Reason}, State) ->
    handle_down(Pid, State).

%% internal

%% Register Pid for Keys unless it is already registered or one of the keys
%% is taken. On success the writer is monitored.
%% NOTE(review): relies on ?CHECK returning the {ok, _} of the last check --
%% confirm in chronicle.hrl.
handle_register_writer(Pid, Keys, Writers) ->
    Checked = ?CHECK(check_not_registered(Pid, Writers),
                     check_key_conflicts(Keys, Writers)),
    case Checked of
        {error, _} = Failure ->
            {reply, Failure, Writers};
        {ok, Cleaned} ->
            MRef = erlang:monitor(process, Pid),
            {reply, ok, maps:put(Pid, {MRef, Keys}, Cleaned)}
    end.

check_not_registered(Pid, Writers) ->
    case maps:is_key(Pid, Writers) of
        false ->
            ok;
        true ->
            {error, already_registered}
    end.

%% Check that none of Keys is owned by a live registered writer.
%%
%% Returns {ok, NewWriters}, where writers that own some of Keys but are
%% already dead have been removed (monitor dropped, their keys deleted), or
%% {error, {key_conflict, Pid, ConflictingKeys}} for a live owner.
%%
%% Dead writers are only cleaned up once the whole map has been checked. If
%% the check fails nothing is touched, so such writers keep their monitor
%% and are removed by the regular 'DOWN' handling. Cleaning them up while
%% still scanning would leave them in the map without a monitor whenever a
%% later entry turned out to be a live conflict.
check_key_conflicts(Keys, Writers) ->
    try find_dead_conflicts(Keys, Writers) of
        DeadWriters ->
            lists:foreach(
              fun ({_Pid, MRef, PidKeys}) ->
                      erlang:demonitor(MRef, [flush]),
                      delete_keys(PidKeys)
              end, DeadWriters),

            DeadPids = [Pid || {Pid, _MRef, _PidKeys} <- DeadWriters],
            {ok, maps:without(DeadPids, Writers)}
    catch
        throw:{key_conflict, _Pid, _ConflictKeys} = Error ->
            {error, Error}
    end.

%% Collect {Pid, MRef, PidKeys} for every writer that owns some of Keys but
%% is no longer alive. Throws {key_conflict, Pid, Keys} on the first live
%% owner encountered.
find_dead_conflicts(Keys, Writers) ->
    maps:fold(
      fun (Pid, {MRef, PidKeys}, Dead) ->
              Intersection = sets:intersection(Keys, PidKeys),
              HasConflict = sets:size(Intersection) > 0,
              case HasConflict andalso is_process_alive(Pid) of
                  true ->
                      throw({key_conflict, Pid, sets:to_list(Intersection)});
                  false when HasConflict ->
                      %% The process is dead but we haven't
                      %% received/processed the monitor notification yet.
                      %% Other processes might have done so already. So
                      %% don't error out unnecessarily.
                      [{Pid, MRef, PidKeys} | Dead];
                  false ->
                      Dead
              end
      end, [], Writers).

%% A monitored writer died: forget it and delete its keys from the table.
handle_down(Pid, Writers) ->
    {{_, Keys}, NewWriters} = maps:take(Pid, Writers),
    delete_keys(Keys),
    {noreply, NewWriters}.

delete_keys(Keys) ->
    lists:foreach(
      fun (Key) ->
              ets:delete(?TABLE, Key)
      end, sets:to_list(Keys)).

--------------------------------------------------------------------------------
/src/chronicle_events.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_events).

-behavior(gen_server).

-include("chronicle.hrl").

-define(SERVER, ?SERVER_NAME(?MODULE)).

-export([start_link/0, start_link/1]).
-export([notify/1, notify/2,
         sync_notify/1, sync_notify/2,
         subscribe/1, subscribe/2]).

-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).

%% The one-argument variants of the API address the event server named
%% after this module.

start_link() ->
    start_link(?MODULE).

start_link(Name) ->
    gen_server:start_link(?START_NAME(Name), ?MODULE, [], []).

%% Asynchronous notification (cast).
notify(Event) ->
    notify(?MODULE, Event).

notify(Name, Event) ->
    Server = ?SERVER_NAME(Name),
    gen_server:cast(Server, {notify, Event}).

%% Synchronous notification: returns after the server has run the handlers.
sync_notify(Event) ->
    sync_notify(?MODULE, Event).

sync_notify(Name, Event) ->
    Server = ?SERVER_NAME(Name),
    gen_server:call(Server, {sync_notify, Event}, infinity).

%% Subscribe the calling process with Handler. Returns the server's pid.
subscribe(Handler) ->
    subscribe(?MODULE, Handler).

subscribe(Name, Handler) ->
    Server = ?SERVER_NAME(Name),
    gen_server:call(Server, {subscribe, self(), Handler}, infinity).

%% callbacks

%% The state is a map of watcher pid => list of handlers. Exits are trapped
%% so that dying watchers arrive as 'EXIT' messages.
init([]) ->
    process_flag(trap_exit, true),
    {ok, #{}}.

handle_call({sync_notify, Event}, _From, Watchers) ->
    Remaining = notify_watchers(Event, Watchers),
    {reply, ok, Remaining};
handle_call({subscribe, Pid, Handler}, _From, Watchers) ->
    Updated = add_watcher(Pid, Handler, Watchers),
    {reply, self(), Updated};
handle_call(_Call, _From, Watchers) ->
    {reply, nack, Watchers}.

handle_cast({notify, Event}, Watchers) ->
    Remaining = notify_watchers(Event, Watchers),
    {noreply, Remaining};
handle_cast(Cast, Watchers) ->
    ?WARNING("Unexpected cast ~p", [Cast]),
    {noreply, Watchers}.

%% A linked process exited. Known watchers are simply forgotten; an exit
%% from any other process stops the server.
handle_info({'EXIT', Pid, _Reason} = Exit, Watchers) ->
    case remove_watcher(Pid, Watchers) of
        error ->
            {stop, {unknown_process_died, Exit}, Watchers};
        {ok, Remaining} ->
            {noreply, Remaining}
    end;
handle_info(Msg, Watchers) ->
    ?WARNING("Received unexpected message ~p", [Msg]),
    {noreply, Watchers}.

%% Take every watcher down with a shutdown reason that names this module.
terminate(Reason, Watchers) ->
    ExitReason = {shutdown, {?MODULE, Reason}},
    lists:foreach(
      fun (Pid) ->
              terminate_watcher(Pid, ExitReason)
      end, maps:keys(Watchers)).

%% internal

%% Link to the watcher and put Handler in front of its handler list.
add_watcher(Pid, Handler, Watchers) ->
    link(Pid),
    Prepend = fun (Existing) -> [Handler | Existing] end,
    maps:update_with(Pid, Prepend, [Handler], Watchers).

%% Run every handler of every watcher on Event. A watcher whose handler
%% raises is terminated and dropped from the returned map.
notify_watchers(Event, Watchers) ->
    Crashed =
        maps:fold(
          fun (Pid, Handlers, Acc) ->
                  try
                      lists:foreach(fun (H) -> H(Event) end, Handlers),
                      Acc
                  catch
                      T:E:Stack ->
                          Reason = {handler_crashed, {T, E, Stack}},
                          terminate_watcher(Pid, Reason),
                          [Pid | Acc]
                  end
          end, [], Watchers),

    maps:without(Crashed, Watchers).

remove_watcher(Pid, Watchers) ->
    case maps:take(Pid, Watchers) of
        error ->
            error;
        {_Handlers, Remaining} ->
            {ok, Remaining}
    end.

%% Kill the watcher with Reason (which must not be normal), unlink from it
%% and discard an 'EXIT' message from it if one is already in the mailbox.
terminate_watcher(Pid, Reason) ->
    true = (Reason =/= normal),
    exit(Pid, Reason),
    unlink(Pid),
    receive
        {'EXIT', Pid, _} ->
            ok
    after 0 ->
            ok
    end.

--------------------------------------------------------------------------------
/src/chronicle_failover.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_failover).

-include("chronicle.hrl").

-behavior(gen_server).

-export([start_link/0]).
-export([failover/1, failover/2, try_cancel/2]).

-export([init/1, handle_call/3, handle_cast/2]).

-export_type([failover_result/0, try_cancel_result/0]).

-define(SERVER, ?SERVER_NAME(?MODULE)).

%% Timeouts are read from chronicle_settings each time they are used; the
%% second argument is the fallback value.
-define(STORE_BRANCH_TIMEOUT,
        chronicle_settings:get({failover, store_branch_timeout}, 15000)).
-define(WAIT_LEADER_TIMEOUT,
        chronicle_settings:get({failover, wait_leader_timeout}, 10000)).
-define(CANCEL_BRANCH_TIMEOUT,
        chronicle_settings:get({failover, cancel_branch_timeout}, 15000)).
-define(CLEANUP_BRANCH_TIMEOUT,
        chronicle_settings:get({failover, cleanup_branch_timeout}, 5000)).

-record(state, {}).

start_link() ->
    Name = ?START_NAME(?MODULE),
    gen_server:start_link(Name, ?MODULE, [], []).

-type failover_result() :: ok | {error, failover_error()}.
-type failover_error() :: {not_in_peers, chronicle:peer(), [chronicle:peer()]}
                        | {aborted, #{diverged_peers => [chronicle:peer()],
                                      failed_peers => [chronicle:peer()]}}
                        | no_leader.

%% Fail over to KeepPeers without any opaque data attached to the branch.
-spec failover([chronicle:peer()]) -> failover_result().
failover(KeepPeers) ->
    NoOpaque = undefined,
    failover(KeepPeers, NoOpaque).

-spec failover([chronicle:peer()], Opaque::any()) -> failover_result().
%% Ask the failover server to store a new branch on KeepPeers, then wait
%% for a leader to be elected in the new history.
failover(KeepPeers, Opaque) ->
    case gen_server:call(?SERVER, {failover, KeepPeers, Opaque}, infinity) of
        {error, _} = Failure ->
            Failure;
        {ok, HistoryId} ->
            %% Make sure chronicle_leader is aware of the branch stored locally.
            ok = chronicle_leader:sync(),
            wait_for_new_leader(HistoryId)
    end.

%% ok once a leader in history HistoryId is known; {error, no_leader} when
%% the wait exits with no_leader or the leader belongs to another history.
wait_for_new_leader(HistoryId) ->
    %% Wait for leader to get elected. This doesn't guarantee that
    %% everything went smoothly. But it's a close approximation.
    try chronicle_leader:wait_for_leader(?WAIT_LEADER_TIMEOUT) of
        {_Leader, {HistoryId, _}} ->
            ok;
        _ ->
            {error, no_leader}
    catch
        exit:no_leader ->
            {error, no_leader}
    end.

-type try_cancel_result() :: ok
                           | {error, {failed_peers, [chronicle:peer()]}}.

%% Try to undo branch BranchId on Peers.
-spec try_cancel(chronicle:history_id(), [chronicle:peer()]) ->
          try_cancel_result().
try_cancel(BranchId, Peers) ->
    Request = {try_cancel, BranchId, Peers},
    gen_server:call(?SERVER, Request, infinity).

%% gen_server callbacks
init([]) ->
    {ok, #state{}}.

handle_call({failover, KeepPeers, Opaque}, _From, State) ->
    handle_failover(KeepPeers, Opaque, State);
handle_call({try_cancel, BranchId, Peers}, _From, State) ->
    handle_try_cancel(BranchId, Peers, State);
handle_call(_Call, _From, State) ->
    {reply, nack, State}.

handle_cast(Cast, State) ->
    ?WARNING("Unexpected cast ~p.~nState:~n~p",
             [Cast, State]),
    {noreply, State}.

%% internal

%% Create a branch with a fresh history id on top of the local metadata.
handle_failover(KeepPeers, Opaque, State) ->
    LocalMetadata = chronicle_agent:get_metadata(),
    BranchId = chronicle_utils:random_uuid(),
    {reply, prepare_branch(KeepPeers, Opaque, BranchId, LocalMetadata), State}.

handle_try_cancel(BranchId, Peers, State) ->
    Result = cancel_branch(BranchId, Peers),
    {reply, Result, State}.

%% Store a new branch on the followers first and on this node last.
%% The local peer must be one of KeepPeers. If either step fails, an attempt
%% is made to undo the branch on the followers before the error is returned.
prepare_branch(KeepPeers, Opaque, NewHistoryId, Metadata) ->
    #metadata{peer = Self, history_id = OldHistoryId} = Metadata,
    case lists:member(Self, KeepPeers) of
        false ->
            {error, {not_in_peers, Self, KeepPeers}};
        true ->
            Branch = #branch{history_id = NewHistoryId,
                             old_history_id = OldHistoryId,
                             coordinator = Self,
                             peers = KeepPeers,
                             opaque = Opaque},
            Followers = KeepPeers -- [Self],

            case store_branch(Followers, Branch) of
                {error, _} = RemoteError ->
                    %% Attempt to undo the branch.
                    cleanup_branch(Branch, Followers),
                    RemoteError;
                ok ->
                    case local_store_branch(Branch) of
                        ok ->
                            {ok, NewHistoryId};
                        {error, LocalError} ->
                            ?WARNING("Failed to store branch locallly.~n"
                                     "Branch:~n~p~n"
                                     "Error: ~p",
                                     [Branch, LocalError]),

                            %% All errors are clean errors currently. So we
                            %% make an attempt to undo the branch on the
                            %% followers.
                            cleanup_branch(Branch, Followers),
                            {error, {aborted, #{failed_peers => [Self]}}}
                    end
            end
    end.

local_store_branch(Branch) ->
    ?DEBUG("Setting local brach:~n~p", [Branch]),
    chronicle_agent:local_store_branch(Branch, ?STORE_BRANCH_TIMEOUT).

%% Store Branch on Peers. Any per-peer error aborts the operation; the
%% errors are grouped by massage_errors/1.
store_branch(Peers, Branch) ->
    ?DEBUG("Setting branch.~n"
           "Peers: ~w~n"
           "Branch:~n~p",
           [Peers, Branch]),

    {_Ok, PeerErrors} =
        chronicle_agent:store_branch(Peers, Branch, ?STORE_BRANCH_TIMEOUT),
    case maps:size(PeerErrors) of
        0 ->
            ok;
        _ ->
            ?WARNING("Failed to store branch on some peers.~n"
                     "Branch:~n~p~n"
                     "Errors:~n~p",
                     [Branch, PeerErrors]),
            {error, {aborted, massage_errors(PeerErrors)}}
    end.

%% Group failed peers by failure kind: history mismatches go under
%% diverged_peers, everything else under failed_peers.
massage_errors(Errors) ->
    chronicle_utils:groupby_map(
      fun ({Peer, Error}) ->
              case Error of
                  {error, {history_mismatch, _}} ->
                      {diverged_peers, Peer};
                  _ ->
                      {failed_peers, Peer}
              end
      end, maps:to_list(Errors)).

%% User-requested cancellation; the result is reported to the caller.
cancel_branch(BranchId, Peers) ->
    undo_branch(BranchId, Peers, ?CANCEL_BRANCH_TIMEOUT).

%% Best-effort cleanup after a failed failover; the result is ignored.
cleanup_branch(#branch{history_id = BranchId}, Peers) ->
    _ = undo_branch(BranchId, Peers, ?CLEANUP_BRANCH_TIMEOUT),
    ok.

%% Undo branch BranchId on Peers.
%%
%% Peers that report no_branch or bad_branch do not count as failures.
%% Returns ok, or {error, {failed_peers, Peers}} with the flat list of peers
%% on which the undo failed, as promised by try_cancel_result().
undo_branch(BranchId, Peers, Timeout) ->
    ?DEBUG("Undoing branch.~n"
           "Branch id: ~w~n"
           "Peers: ~w",
           [BranchId, Peers]),

    {_Ok, Bad} = chronicle_agent:undo_branch(Peers, BranchId, Timeout),
    Errors = maps:filter(
               fun (_, Error) ->
                       case Error of
                           {error, no_branch} ->
                               %% No branch found.
                               false;
                           {error, {bad_branch, _}} ->
                               %% Branch superseded by another one.
                               false;
                           _ ->
                               true
                       end
               end, Bad),

    case maps:size(Errors) =:= 0 of
        true ->
            ?DEBUG("Branch undone successfully."),
            ok;
        false ->
            ?WARNING("Failed to undo branch on some nodes:~n~p", [Errors]),
            %% maps:keys/1 already returns the list of peers; it must not be
            %% wrapped in another list.
            {error, {failed_peers, maps:keys(Errors)}}
    end.

--------------------------------------------------------------------------------
/src/chronicle_log.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_log).

-include("chronicle.hrl").

-export([open/1, open/4, read_log/4, sync/1,
         close/1, create/2, append/2, data_size/1]).

-define(MAGIC, <<"chronicle">>).
-define(MAGIC_BYTES, 9).
-define(LOG_VERSION, 1).
-define(HEADER_BYTES, (?MAGIC_BYTES + 1)).

%% Arithmetic macro bodies are parenthesized so that they expand safely
%% inside larger expressions.
-define(READ_CHUNK_SIZE, (1024 * 1024)).
-define(WRITE_CHUNK_SIZE, (1024 * 1024)).

-define(TERM_SIZE_BITS, 32).
-define(TERM_SIZE_BYTES, (?TERM_SIZE_BITS bsr 3)).
-define(TERM_SIZE_MAX, ((1 bsl ?TERM_SIZE_BITS) - 1)).
-define(TERM_HEADER_BYTES, (?CRC_BYTES + ?TERM_SIZE_BYTES)).

-record(log, { path,
               fd,
               mode,
               start_pos }).

%% Open an existing log for writing and position the file at its end. The
%% entries are not scanned.
%%
%% Returns {ok, Log} or {error, Reason}. The file is closed again if
%% positioning fails, so no descriptor is left open on the error path.
open(Path) ->
    case open_int(Path, write) of
        {ok, #log{fd = Fd} = Log, _} ->
            case file:position(Fd, eof) of
                {ok, _} ->
                    {ok, Log};
                {error, _} = Error ->
                    ok = close(Log),
                    Error
            end;
        {error, _} = Error ->
            Error
    end.

%% Open an existing log for writing and replay it.
%%
%% UserDataFun(UserData, State) is applied to the header's user data and
%% LogEntryFun(Entry, State) to every entry, threading State through. The
%% scan runs with repair enabled, so an incomplete tail is truncated.
%%
%% Returns {ok, Log, NewState} or {error, Reason}. The log is closed on
%% error and when one of the funs raises (the exception is re-raised).
open(Path, UserDataFun, LogEntryFun, State) ->
    case open_int(Path, write) of
        {ok, Log, UserData} ->
            try scan(Log, LogEntryFun,
                     UserDataFun(UserData, State), #{repair => true}) of
                {ok, NewState} ->
                    {ok, Log, NewState};
                {error, _} = Error ->
                    ok = close(Log),
                    Error
            catch
                T:E:Stack ->
                    ok = close(Log),
                    erlang:raise(T, E, Stack)
            end;
        {error, _} = Error ->
            Error
    end.

%% Open Path in the given mode and read its header. Returns
%% {ok, Log, UserData}; the file is closed again when the header is bad.
open_int(Path, Mode) ->
    case file:open(Path, open_flags(Mode)) of
        {ok, Fd} ->
            case read_header(Fd) of
                {ok, UserData} ->
                    {ok, make_log(Path, Fd, Mode), UserData};
                {error, _} = Error ->
                    ok = file:close(Fd),
                    Error
            end;
        {error, _} = Error ->
            Error
    end.

%% Replay a log opened in read mode. No repair is attempted, and the file is
%% always closed afterwards. Returns the result of the scan.
read_log(Path, UserDataFun, LogEntryFun, State) ->
    case open_int(Path, read) of
        {ok, Log, UserData} ->
            try
                scan(Log, LogEntryFun, UserDataFun(UserData, State))
            after
                close(Log)
            end;
        {error, _} = Error ->
            Error
    end.

sync(#log{fd = Fd}) ->
    %% TODO: preallocate log when possible and use fdatasync instead of sync.
    file:sync(Fd).

close(#log{fd = Fd})->
    file:close(Fd).

%% Create (or recreate) the log at Path with UserData in its header. The
%% file and its directory are synced before returning {ok, Log}.
create(Path, UserData) ->
    Mode = write,
    case file:open(Path, open_flags(Mode)) of
        {ok, Fd} ->
            %% Truncate the file if we're recreating it.
            ok = truncate(Fd, 0),
            ok = write_header(Fd, UserData),
            ok = file:sync(Fd),
            ok = chronicle_utils:sync_dir(filename:dirname(Path)),
            {ok, make_log(Path, Fd, Mode)};
        {error, _} = Error ->
            Error
    end.

%% Read the fixed-size part of the header: ?HEADER_BYTES of magic and
%% version, followed by the ?TERM_HEADER_BYTES header of the user-data term.
%% The user-data term itself is read by read_header_user_data/2.
read_header(Fd) ->
    Size = ?HEADER_BYTES + ?TERM_HEADER_BYTES,
    case chronicle_utils:read_full(Fd, Size) of
        {ok, <<HeaderData:?HEADER_BYTES/binary,
               TermHeader:?TERM_HEADER_BYTES/binary>>} ->
            case check_header_data(HeaderData) of
                ok ->
                    read_header_user_data(Fd, TermHeader);
                {error, _} = Error ->
                    Error
            end;
        eof ->
            {error, no_header};
        {error, _} = Error ->
            Error
    end.

%% Decode the user-data term that follows the file header. TermHeader is the
%% already-read term header; the remaining ?CRC_BYTES + Size bytes are read
%% from Fd.
read_header_user_data(Fd, TermHeader) ->
    case decode_entry_size(TermHeader) of
        {ok, Size, _, <<>>} ->
            FullSize = ?CRC_BYTES + Size,
            case chronicle_utils:read_full(Fd, FullSize) of
                {ok, TermData} ->
                    case decode_entry_term(TermData, Size) of
                        {ok, UserData, _, <<>>} ->
                            {ok, UserData};
                        corrupt ->
                            {error, {corrupt_log, bad_header_user_data}}
                    end;
                eof ->
                    {error, no_header};
                {error, _} = Error ->
                    Error
            end;
        corrupt ->
            {error, {corrupt_log, bad_header_user_data}}
    end.

%% Validate the magic and version of a ?HEADER_BYTES binary. The version is
%% the single byte that follows the ?MAGIC_BYTES of magic.
check_header_data(<<MaybeMagic:?MAGIC_BYTES/binary, Version:8>>) ->
    case {MaybeMagic, Version} of
        {?MAGIC, ?LOG_VERSION} ->
            ok;
        {?MAGIC, _} ->
            {error, {unsupported_version, Version}};
        _ ->
            {error, {corrupt_log, bad_header}}
    end.

%% Write the magic, the one-byte version and the encoded user data.
write_header(Fd, UserData) ->
    Header = <<?MAGIC/binary, ?LOG_VERSION:8>>,
    file:write(Fd, encode_term(UserData, Header)).

%% Append Terms to a log opened in write mode. Returns
%% {ok, BytesWritten}, or the first non-ok result of file:write/2.
append(#log{mode = write, fd = Fd}, Terms) ->
    encode_terms(Terms,
                 fun (Data) ->
                         file:write(Fd, Data)
                 end).

%% Number of bytes between the end of the header and the current file
%% position.
data_size(#log{fd = Fd, start_pos = HeaderSize}) ->
    case file:position(Fd, cur) of
        {ok, Size} ->
            DataSize = Size - HeaderSize,
            true = (DataSize >= 0),
            {ok, DataSize};
        {error, _} = Error ->
            Error
    end.

encode_terms(Terms, Fun) ->
    encode_terms(Terms, <<>>, 0, Fun).
%% ---------------------------------------------------------------------------
%% Term encoding and the entry points for scanning a log.
%%
%% encode_terms(Terms, AccData, AccWritten, Fun)
%%   Encodes the terms one by one onto AccData. Whenever the accumulated
%%   binary reaches ?WRITE_CHUNK_SIZE bytes it is passed to Fun and the
%%   accumulator is reset. When the list is exhausted the remaining
%%   accumulator is passed to Fun as well (also when it is empty). Returns
%%   {ok, TotalBytes} or the first non-ok value returned by Fun.
%%
%% encode_term(Term, AccData)
%%   Serializes Term with term_to_binary/1, passes the result through
%%   ?ENCRYPT, asserts that its size does not exceed ?TERM_SIZE_MAX and
%%   computes CRC32 checksums of both the encoded size and the term binary.
%%   NOTE(review): the two binary constructions (SizeEncoded and the final
%%   result) are not legible in this listing; presumably the result is
%%   AccData followed by the size and term together with their checksums -
%%   confirm against the original file.
%%
%% make_log(Path, Fd, Mode)
%%   Builds a #log{} whose start_pos is the descriptor's current position.
%%
%% truncate(Fd, Pos)
%%   Positions the descriptor at Pos and truncates the file there.
%%
%% scan(Log, Fun, State)
%%   Same as scan/4 with an empty options map.
%%
%% scan(Log, Fun, State, Opts)
%%   Seeks to the log's start_pos (asserting the resulting position) and runs
%%   scan_loop/7 from there with an empty leftover buffer.
%% ---------------------------------------------------------------------------
196 | 197 | encode_terms([], AccData, AccWritten, Fun) -> 198 | case Fun(AccData) of 199 | ok -> 200 | {ok, byte_size(AccData) + AccWritten}; 201 | Other -> 202 | Other 203 | end; 204 | encode_terms([Term|Terms], AccData, AccWritten, Fun) -> 205 | NewAccData = encode_term(Term, AccData), 206 | Size = byte_size(NewAccData), 207 | case Size >= ?WRITE_CHUNK_SIZE of 208 | true -> 209 | case Fun(NewAccData) of 210 | ok -> 211 | encode_terms(Terms, <<>>, AccWritten + Size, Fun); 212 | Other -> 213 | Other 214 | end; 215 | false -> 216 | encode_terms(Terms, NewAccData, AccWritten, Fun) 217 | end. 218 | 219 | encode_term(Term, AccData) -> 220 | TermBinary = ?ENCRYPT(term_to_binary(Term)), 221 | Size = byte_size(TermBinary), 222 | true = (Size =< ?TERM_SIZE_MAX), 223 | 224 | SizeEncoded = <>, 225 | SizeCrc = erlang:crc32(SizeEncoded), 226 | TermCrc = erlang:crc32(TermBinary), 227 | 228 | <>. 233 | 234 | make_log(Path, Fd, Mode) -> 235 | {ok, Pos} = file:position(Fd, cur), 236 | #log{path = Path, fd = Fd, mode = Mode, start_pos = Pos}. 237 | 238 | truncate(Fd, Pos) -> 239 | case file:position(Fd, Pos) of 240 | {ok, _} -> 241 | file:truncate(Fd); 242 | {error, _} = Error -> 243 | Error 244 | end. 245 | 246 | scan(Log, Fun, State) -> 247 | scan(Log, Fun, State, #{}). 248 | 249 | scan(#log{path = Path, fd = Fd, start_pos = Pos}, Fun, State, Opts) -> 250 | case file:position(Fd, Pos) of 251 | {ok, ActualPos} -> 252 | true = (Pos =:= ActualPos), 253 | scan_loop(Path, Fd, Pos, <<>>, Fun, State, Opts); 254 | {error, _} = Error -> 255 | Error 256 | end.
%% ---------------------------------------------------------------------------
%% Scanning log entries.
%%
%% scan_loop(Path, Fd, Pos, AccData, Fun, State, Opts)
%%   Reads the file in chunks of ?READ_CHUNK_SIZE bytes and feeds the data to
%%   scan_chunk/4. Pos is the file offset up to which entries have been
%%   decoded. On eof the scan succeeds with {ok, State} when no undecoded
%%   bytes are left over; otherwise maybe_repair/5 decides what to do with the
%%   incomplete tail.
%%   NOTE(review): the binary construction bound to Data is not legible in
%%   this listing; presumably it is the leftover AccData followed by the newly
%%   read ReadData - confirm against the original file.
%%
%% maybe_repair(Path, Fd, Pos, State, Opts)
%%   Logs a warning about the unexpected end of file. When Opts has
%%   repair => true the file is truncated to Pos and {ok, State} is returned
%%   (or {error, {truncate_failed, Error}} when truncation fails). Without the
%%   option, which defaults to false, {error, {unexpected_eof, Pos}} is
%%   returned.
%%
%% scan_chunk(Pos, Data, Fun, State)
%%   Decodes as many complete entries from Data as possible, folding each
%%   decoded term into the state with Fun(Term, State). Returns
%%   {ok, NewPos, UndecodedData, NewState} once more data is needed, or
%%   {error, {corrupt_log, {bad_entry, Pos}}} for a corrupt entry.
%%   NOTE(review): decode_entry_term/2 in this file returns {error, corrupt}
%%   when decryption fails; that value is passed through by decode_entry/1 and
%%   is not matched by any clause here, so it would raise case_clause -
%%   confirm whether that is intended.
%%
%% decode_entry(Data)
%%   Decodes the size prefix and then the term, adding up the bytes consumed
%%   by both steps. Any non-{ok, ...} result of either step is returned as is.
%% ---------------------------------------------------------------------------
257 | 258 | scan_loop(Path, Fd, Pos, AccData, Fun, State, Opts) -> 259 | case file:read(Fd, ?READ_CHUNK_SIZE) of 260 | {ok, ReadData} -> 261 | Data = <>, 262 | case scan_chunk(Pos, Data, Fun, State) of 263 | {ok, NewPos, DataLeft, NewState} -> 264 | scan_loop(Path, Fd, NewPos, DataLeft, Fun, NewState, Opts); 265 | {error, _} = Error -> 266 | Error 267 | end; 268 | eof -> 269 | case AccData of 270 | <<>> -> 271 | {ok, State}; 272 | _ -> 273 | maybe_repair(Path, Fd, Pos, State, Opts) 274 | end; 275 | {error, _} = Error -> 276 | Error 277 | end. 278 | 279 | maybe_repair(Path, Fd, Pos, State, Opts) -> 280 | {ok, CurPos} = file:position(Fd, cur), 281 | 282 | ?WARNING("Unexpected end of file in '~s' at ~b.", [Path, CurPos]), 283 | 284 | case maps:get(repair, Opts, false) of 285 | true -> 286 | case truncate(Fd, Pos) of 287 | ok -> 288 | ?INFO("Truncated '~s' to ~b", [Path, Pos]), 289 | {ok, State}; 290 | {error, Error} -> 291 | ?ERROR("Could not truncate '~s' to ~b: ~p", 292 | [Path, Pos, Error]), 293 | {error, {truncate_failed, Error}} 294 | end; 295 | false -> 296 | ?ERROR("Not attempting to repair '~s'.", [Path]), 297 | {error, {unexpected_eof, Pos}} 298 | end. 299 | 300 | scan_chunk(Pos, Data, Fun, State) -> 301 | case decode_entry(Data) of 302 | {ok, Term, BytesConsumed, NewData} -> 303 | NewState = Fun(Term, State), 304 | NewPos = Pos + BytesConsumed, 305 | scan_chunk(NewPos, NewData, Fun, NewState); 306 | need_more_data -> 307 | {ok, Pos, Data, State}; 308 | corrupt -> 309 | {error, {corrupt_log, {bad_entry, Pos}}} 310 | end. 311 | 312 | decode_entry(Data) -> 313 | case decode_entry_size(Data) of 314 | {ok, Size, Consumed0, NewData0} -> 315 | case decode_entry_term(NewData0, Size) of 316 | {ok, Term, Consumed1, NewData} -> 317 | {ok, Term, Consumed0 + Consumed1, NewData}; 318 | Error -> 319 | Error 320 | end; 321 | Error -> 322 | Error 323 | end.
%% ---------------------------------------------------------------------------
%% Low-level entry decoding.
%%
%% decode_entry_size(Data)
%%   Extracts the CRC-protected size field (?TERM_SIZE_BYTES bytes of
%%   payload) from the front of Data. Returns {ok, Size, Consumed, Rest}, or
%%   whatever get_crc_data/2 returned otherwise.
%%   NOTE(review): the binary pattern that binds Size is not legible in this
%%   listing.
%%
%% decode_entry_term(Data, Size)
%%   Extracts a CRC-protected payload of Size bytes, passes it through
%%   ?DECRYPT and deserializes the result with binary_to_term/1. Returns
%%   {ok, Term, Consumed, Rest}.
%%   NOTE(review): a decryption failure is reported as {error, corrupt},
%%   whereas get_crc_data/2 reports corruption as the bare atom 'corrupt' and
%%   the callers in this file (scan_chunk/4, read_header_user_data/2) only
%%   match the bare atom. Returning 'corrupt' here looks like the intended
%%   contract - confirm.
%%   NOTE(review): binary_to_term/1 is called without the 'safe' option; this
%%   presumably relies on log files being trusted input.
%%
%% open_flags(Mode)
%%   File open flags: [raw, binary, read] for read mode, with 'append' added
%%   in front for write mode. Any other mode raises case_clause.
%%
%% get_crc_data(Data, Size)
%%   Returns need_more_data when Data holds fewer than ?CRC_BYTES + Size
%%   bytes. Otherwise compares erlang:crc32(Payload) with Crc and returns
%%   {ok, Payload, ?CRC_BYTES + Size, RestData} on a match or 'corrupt' on a
%%   mismatch.
%%   NOTE(review): the binary pattern binding Crc, Payload and RestData is not
%%   legible in this listing.
%% ---------------------------------------------------------------------------
324 | 325 | decode_entry_size(Data) -> 326 | case get_crc_data(Data, ?TERM_SIZE_BYTES) of 327 | {ok, <>, Consumed, NewData} -> 328 | {ok, Size, Consumed, NewData}; 329 | Error -> 330 | Error 331 | end. 332 | 333 | decode_entry_term(Data, Size) -> 334 | case get_crc_data(Data, Size) of 335 | {ok, TermBinary, Consumed, NewData} -> 336 | case ?DECRYPT(TermBinary) of 337 | {ok, DecryptedTermBinary} -> 338 | Term = binary_to_term(DecryptedTermBinary), 339 | {ok, Term, Consumed, NewData}; 340 | {error, decrypt_error} -> 341 | {error, corrupt} 342 | end; 343 | Error -> 344 | Error 345 | end. 346 | 347 | open_flags(Mode) -> 348 | Flags = [raw, binary, read], 349 | case Mode of 350 | read -> 351 | Flags; 352 | write -> 353 | [append | Flags] 354 | end. 355 | 356 | get_crc_data(Data, Size) -> 357 | NeedSize = ?CRC_BYTES + Size, 358 | case byte_size(Data) < NeedSize of 359 | true -> 360 | need_more_data; 361 | false -> 362 | <> = Data, 365 | case erlang:crc32(Payload) =:= Crc of 366 | true -> 367 | {ok, Payload, NeedSize, RestData}; 368 | false -> 369 | corrupt 370 | end 371 | end. 372 | -------------------------------------------------------------------------------- /src/chronicle_logger_filter.erl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2020 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License.
%%
-module(chronicle_logger_filter).

-export([filter/2]).

%% Logger filter that sanitizes crash reports originating from the modules
%% that are keys of the Modules map.
%%
%% Only events whose msg is a map report carrying a label are considered.
%% When the label-specific helper produces a sanitized report, the event is
%% returned with its msg replaced by that report. In every other case the
%% atom 'ignore' is returned.
filter(Event, Modules) ->
    case maps:find(msg, Event) of
        {ok, {report, Report}} when is_map(Report) ->
            filter_report(Event, Report, Modules);
        _ ->
            ignore
    end.

%% Sanitize a labelled report and put the result back into the event.
filter_report(Event, Report, Modules) ->
    case maps:find(label, Report) of
        error ->
            ignore;
        {ok, Label} ->
            case sanitize_report(Label, Report, Modules) of
                ignore ->
                    ignore;
                Sanitized ->
                    Event#{msg => {report, Sanitized}}
            end
    end.

%% Pick the sanitizer matching the report label.
sanitize_report({gen_statem, terminate}, Report, Modules) ->
    gen_statem_filter(Report, Modules);
sanitize_report({proc_lib, crash}, Report, Modules) ->
    proc_lib_filter(Report, Modules);
sanitize_report({supervisor, Kind}, Report, Modules)
  when Kind =:= child_terminated;
       Kind =:= start_error;
       Kind =:= shutdown_error;
       Kind =:= shutdown ->
    supervisor_filter(Report, Modules);
sanitize_report(_Label, _Report, _Modules) ->
    ignore.

%% Sanitize a gen_statem termination report: queued and postponed events go
%% through the callback module's sanitize_event/2 (when exported), the reason
%% and the stacktrace through chronicle_utils. Reports whose first callback
%% module is not a key of Modules, or that lack the expected fields, yield
%% 'ignore'.
gen_statem_filter(#{modules := [Module | _],
                    queue := Queue,
                    postponed := Postponed,
                    reason := {Class, Reason, Stack}} = Report, Modules)
  when is_map_key(Module, Modules) ->
    Report#{queue => sanitize_events(Module, Queue),
            postponed => sanitize_events(Module, Postponed),
            reason => {Class,
                       chronicle_utils:sanitize_reason(Reason),
                       chronicle_utils:sanitize_stacktrace(Stack)}};
gen_statem_filter(_Report, _Modules) ->
    ignore.

%% Apply Module:sanitize_event/2 to every {Type, Event} pair. The events are
%% returned untouched when the module does not export the callback.
sanitize_events(Module, Events) ->
    case erlang:function_exported(Module, sanitize_event, 2) of
        false ->
            Events;
        true ->
            [safe_sanitize_event(Module, Type, Event)
             || {Type, Event} <- Events]
    end.

%% A crashing callback must not take the logger filter down with it, so the
%% event is replaced by a marker instead.
safe_sanitize_event(Module, Type, Event) ->
    try
        Module:sanitize_event(Type, Event)
    catch
        _:_ ->
            {crashed, {Module, sanitize_event, 2}}
    end.

%% Sanitize a proc_lib crash report for a process whose initial call is
%% Module:init/N with Module being a key of Modules. Anything else yields
%% 'ignore'.
proc_lib_filter(#{report := [Info | Rest]} = Report, Modules)
  when is_list(Info) ->
    case lists:keyfind(initial_call, 1, Info) of
        {_, {Module, init, _}} when is_map_key(Module, Modules) ->
            %% Messages can be large and may contain sensitive
            %% information.
            Redacted = lists:keyreplace(messages, 1, Info,
                                        {messages, omitted}),
            Report#{report => [sanitize_error_info(Redacted) | Rest]};
        _ ->
            ignore
    end;
proc_lib_filter(_Report, _Modules) ->
    ignore.

%% Sanitize the reason and the stacktrace of the error_info entry, if there
%% is one of the expected shape.
sanitize_error_info(Info) ->
    case lists:keyfind(error_info, 1, Info) of
        {_, {Class, Reason, Stack}} ->
            Sanitized = {Class,
                         chronicle_utils:sanitize_reason(Reason),
                         chronicle_utils:sanitize_stacktrace(Stack)},
            lists:keyreplace(error_info, 1, Info, {error_info, Sanitized});
        _ ->
            Info
    end.

%% Sanitize the reason of a supervisor report when the supervisor's callback
%% module is a key of Modules. Anything else yields 'ignore'.
supervisor_filter(#{report := Info} = Report, Modules) when is_list(Info) ->
    case lists:keyfind(supervisor, 1, Info) of
        {_, {_, Module}} when is_map_key(Module, Modules) ->
            replace_reason(Report, Info, lists:keyfind(reason, 1, Info));
        _ ->
            ignore
    end;
supervisor_filter(_Report, _Modules) ->
    ignore.

replace_reason(Report, Info, {_, Reason}) ->
    Sanitized = chronicle_utils:sanitize_reason(Reason),
    Report#{report => lists:keyreplace(reason, 1, Info, {reason, Sanitized})};
replace_reason(_Report, _Info, false) ->
    ignore.
--------------------------------------------------------------------------------
/src/chronicle_peers.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_peers).

-export([start_link/0]).
-export([get_live_peers/0, get_live_peers/1, get_live_peers_other/0]).
-export([monitor/0]).

-ifndef(TEST).

%% Outside of tests there is no process to start.
start_link() ->
    ignore.

%% Sorted list of this node plus all visible nodes.
get_live_peers() ->
    Nodes = nodes([this, visible]),
    lists:sort(Nodes).

%% Sorted list of the nodes returned by nodes/0.
get_live_peers_other() ->
    Others = nodes(),
    lists:sort(Others).

%% Subscribe the calling process to node up/down messages that include the
%% nodedown reason.
monitor() ->
    ok = net_kernel:monitor_nodes(true, [nodedown_reason]).

-else. % -ifndef(TEST)

%% In tests all peer information comes from chronicle_peers_vnet.
start_link() ->
    chronicle_peers_vnet:start_link().

get_live_peers() ->
    chronicle_peers_vnet:get_live_peers().

get_live_peers_other() ->
    Live = get_live_peers(),
    Live -- [vnet:vnode()].

monitor() ->
    chronicle_peers_vnet:monitor().

-endif.

%% The subset of Peers that is currently live, computed as an ordset
%% intersection of the live peers with the sorted, de-duplicated Peers.
get_live_peers(Peers) ->
    Live = get_live_peers(),
    Wanted = lists:usort(Peers),
    ordsets:intersection(Live, Wanted).
--------------------------------------------------------------------------------
/src/chronicle_rsm_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% A supervisor for processes that require the system to be provisioned to
%% run.
-module(chronicle_rsm_sup).

-behavior(dynamic_supervisor).

-include("chronicle.hrl").

-export([start_link/0]).
-export([init/1, handle_event/2, child_specs/1]).

-record(state, { peer_id :: chronicle:peer_id(),
                 config :: #config{} }).

start_link() ->
    dynamic_supervisor:start_link(?START_NAME(?MODULE), ?MODULE, []).

%% callbacks

%% Subscribes to new_config events (forwarded to this supervisor as
%% {new_config, Config}) and seeds the state with the peer id and the config
%% taken from the agent's current metadata.
init([]) ->
    Self = self(),
    chronicle_events:subscribe(
      fun ({new_config, Config, _}) ->
              %% TODO: this is going to wake up the process needlessly
              %% all the time; having more granular events
              dynamic_supervisor:send_event(Self, {new_config, Config});
          (_Other) ->
              ok
      end),

    Metadata = chronicle_agent:get_metadata(),
    State = #state{peer_id = Metadata#metadata.peer_id,
                   config = chronicle_utils:get_config(Metadata)},

    %% TODO: reconsider the strategy
    Flags = #{strategy => one_for_one,
              intensity => 3,
              period => 10},
    {ok, Flags, State}.

%% Remember the latest config.
handle_event({new_config, NewConfig}, State) ->
    {noreply, State#state{config = NewConfig}}.

%% One chronicle_single_rsm_sup child per RSM found in the current config.
child_specs(#state{peer_id = PeerId, config = Config}) ->
    RSMs = chronicle_config:get_rsms(Config),
    [rsm_sup_spec(Name, PeerId, RSMConfig)
     || {Name, RSMConfig} <- maps:to_list(RSMs)].

rsm_sup_spec(Name, PeerId, #rsm_config{module = Module, args = Args}) ->
    StartArgs = [Name, PeerId, Module, Args],
    #{id => Name,
      start => {chronicle_single_rsm_sup, start_link, StartArgs},
      restart => permanent,
      type => supervisor}.
--------------------------------------------------------------------------------
/src/chronicle_secondary_restartable_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_secondary_restartable_sup).

-behavior(supervisor).

-include("chronicle.hrl").

-export([start_link/0]).
-export([init/1]).

start_link() ->
    supervisor:start_link(?START_NAME(?MODULE), ?MODULE, []).

%% callbacks

%% one_for_one supervisor allowing at most 3 restarts within 10 seconds.
init([]) ->
    RestartFlags = #{strategy => one_for_one,
                     intensity => 3,
                     period => 10},
    Children = child_specs(),
    {ok, {RestartFlags, Children}}.

%% chronicle_status is killed outright on shutdown, chronicle_failover gets
%% 5 seconds to terminate.
child_specs() ->
    [worker_spec(chronicle_status, brutal_kill),
     worker_spec(chronicle_failover, 5000)].

%% Permanent worker started through Module:start_link/0.
worker_spec(Module, Shutdown) ->
    #{id => Module,
      start => {Module, start_link, []},
      restart => permanent,
      shutdown => Shutdown,
      type => worker}.
--------------------------------------------------------------------------------
/src/chronicle_secondary_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% A supervisor for processes that require the system to be provisioned to
%% run.
-module(chronicle_secondary_sup).

-behavior(dynamic_supervisor).

-include("chronicle.hrl").

-export([start_link/0]).
-export([sync/0]).
-export([init/1, handle_event/2, child_specs/1]).

start_link() ->
    dynamic_supervisor:start_link(?START_NAME(?MODULE), ?MODULE, []).

%% Synchronize with the supervisor, waiting up to 20 seconds. Anything but
%% 'ok' raises badmatch.
sync() ->
    Server = ?SERVER_NAME(?MODULE),
    ok = dynamic_supervisor:sync(Server, 20000).

%% callbacks

%% Subscribes to wipe requests and system state changes, then derives the
%% initial state from the agent. The state is one of not_provisioned,
%% joining_cluster, provisioned or removed.
init([]) ->
    Self = self(),
    chronicle_events:subscribe(
      fun ({system_event, wiping, _}) ->
              dynamic_supervisor:send_event(Self, wipe_requested);
          ({system_state, NewState, _}) ->
              dynamic_supervisor:send_event(Self, {state, NewState});
          (_Other) ->
              ok
      end),

    State = initial_state(chronicle_agent:is_wipe_requested()),

    %% TODO: reconsider the strategy
    Flags = #{strategy => one_for_all,
              intensity => 3,
              period => 10},
    {ok, Flags, State}.

%% A pending wipe is treated like an unprovisioned system; otherwise the
%% agent's system state decides.
initial_state(true) ->
    not_provisioned;
initial_state(false) ->
    state_name(chronicle_agent:get_system_state()).

state_name({provisioned, _}) -> provisioned;
state_name({removed, _}) -> removed;
state_name({joining_cluster, _}) -> joining_cluster;
state_name(not_provisioned) -> not_provisioned.

handle_event(wipe_requested, _OldState) ->
    {noreply, not_provisioned};
handle_event({state, NewState}, _OldState) ->
    {noreply, NewState}.

%% TODO: revise shutdown specifications

%% Children to run in each system state: nothing when not provisioned, only
%% the leader while joining a cluster, and the full set when provisioned. A
%% removed node runs the same children as a provisioned one.
child_specs(not_provisioned) ->
    [];
child_specs(joining_cluster) ->
    [worker_spec(chronicle_leader)];
child_specs(removed) ->
    child_specs(provisioned);
child_specs(provisioned) ->
    child_specs(joining_cluster) ++
        [supervisor_spec(chronicle_secondary_restartable_sup),
         worker_spec(chronicle_server),
         supervisor_spec(chronicle_rsm_sup)].

%% Permanent worker with a 5 second shutdown timeout.
worker_spec(Module) ->
    #{id => Module,
      start => {Module, start_link, []},
      restart => permanent,
      shutdown => 5000,
      type => worker}.

%% Permanent supervisor that is given unlimited time to shut down.
supervisor_spec(Module) ->
    #{id => Module,
      start => {Module, start_link, []},
      restart => permanent,
      shutdown => infinity,
      type => supervisor}.
--------------------------------------------------------------------------------
/src/chronicle_settings.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_settings).

-behavior(gen_server).

-include("chronicle.hrl").

-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.

-export([start_link/0]).
-export([get/2, get_settings/0, set_settings/1, set_local_settings/1]).

-export([init/1, handle_call/3, handle_cast/2]).

-define(SERVER, ?SERVER_NAME(?MODULE)).
-define(TABLE, ?ETS_TABLE(?MODULE)).

%% settings:           as passed to set_settings/1
%% local_settings:     as passed to set_local_settings/1 (initially taken
%%                     from the application environment)
%% effective_settings: settings merged with local_settings, the latter
%%                     taking precedence
-record(state, { settings = #{},
                 local_settings = #{},
                 effective_settings = #{} }).

start_link() ->
    gen_server:start_link(?START_NAME(?MODULE), ?MODULE, [], []).

%% Read a single setting straight from the ETS table, bypassing the server.
get(Name, Default) ->
    case ets:lookup(?TABLE, Name) of
        [{_, Value}] ->
            Value;
        [] ->
            Default
    end.

%% Fetch the map of effective settings from the server.
get_settings() ->
    gen_server:call(?SERVER, get_settings).

set_settings(Settings) ->
    gen_server:call(?SERVER, {set_settings, Settings}).

set_local_settings(Settings) ->
    gen_server:call(?SERVER, {set_local_settings, Settings}).

%% callbacks

%% Creates the settings table and publishes the local settings found in the
%% 'settings' environment variable of the chronicle application (an empty map
%% when it is not set).
init([]) ->
    TableOptions = [protected, named_table,
                    {read_concurrency, true},
                    {write_concurrency, true}],
    _ = ets:new(?TABLE, TableOptions),

    LocalSettings = application:get_env(chronicle, settings, #{}),
    {ok, handle_new_settings(#{}, LocalSettings, #state{})}.

%% Calls other than get_settings, set_settings and set_local_settings are
%% answered with 'nack'.
handle_call(get_settings, _From, State) ->
    {reply, State#state.effective_settings, State};
handle_call({set_settings, NewSettings}, _From, State) ->
    Local = State#state.local_settings,
    {reply, ok, handle_new_settings(NewSettings, Local, State)};
handle_call({set_local_settings, NewLocal}, _From, State) ->
    Settings = State#state.settings,
    {reply, ok, handle_new_settings(Settings, NewLocal, State)};
handle_call(_Call, _From, State) ->
    {reply, nack, State}.

handle_cast(Cast, State) ->
    ?WARNING("Unexpected cast:~n~p", [Cast]),
    {noreply, State}.

%% internal

%% Recompute the effective settings (local settings override the regular
%% ones) and bring the ETS table in line with them: names that disappeared
%% are deleted, new or changed values are inserted.
handle_new_settings(Settings, LocalSettings, State) ->
    Effective = maps:merge(Settings, LocalSettings),
    {Stale, Changed} =
        diff_settings(State#state.effective_settings, Effective),

    _ = [ets:delete(?TABLE, Name) || Name <- Stale],
    ets:insert(?TABLE, Changed),

    State#state{settings = Settings,
                local_settings = LocalSettings,
                effective_settings = Effective}.

%% Returns {ToDelete, ToSet}: the names present only in OldSettings, and the
%% {Name, Value} pairs of NewSettings that are new or whose value differs
%% from the old one.
diff_settings(OldSettings, NewSettings) ->
    ToDelete =
        maps:fold(
          fun (Name, _, Acc) when is_map_key(Name, NewSettings) ->
                  Acc;
              (Name, _, Acc) ->
                  [Name | Acc]
          end, [], OldSettings),

    ToSet =
        maps:fold(
          fun (Name, Value, Acc) ->
                  case OldSettings of
                      #{Name := Value} ->
                          %% unchanged
                          Acc;
                      _ ->
                          [{Name, Value} | Acc]
                  end
          end, [], NewSettings),

    {ToDelete, ToSet}.

-ifdef(TEST).
diff_settings_test() ->
    OldSettings = #{a => 42, c => fortytwo},
    NewSettings = #{a => 43, d => fortythree},

    {ToDelete, ToSet} = diff_settings(OldSettings, NewSettings),

    ?assertEqual([c], ToDelete),
    ?assertEqual([{a, 43}, {d, fortythree}], lists:sort(ToSet)).
-endif.
--------------------------------------------------------------------------------
/src/chronicle_single_rsm_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% A supervisor for processes that require the system to be provisioned to
%% run.
-module(chronicle_single_rsm_sup).

-behavior(supervisor).
-export([start_link/4]).
-export([init/1]).

start_link(Name, PeerId, Module, Args) ->
    supervisor:start_link(?MODULE, [Name, PeerId, Module, Args]).

%% supervisor callbacks

%% Supervises a single RSM: the chronicle_rsm worker itself, preceded by any
%% extra children requested by the RSM's callback module through specs/2.
%% The strategy is one_for_all with at most 10 restarts within 10 seconds.
init([Name, PeerId, Module, Args]) ->
    Flags = #{strategy => one_for_all,
              intensity => 10,
              period => 10},
    RSM = #{id => Name,
            start => {chronicle_rsm, start_link, [Name, PeerId, Module, Args]},
            restart => permanent,
            shutdown => 5000,
            type => worker},

    {ok, {Flags, extra_specs(Module, Name, Args) ++ [RSM]}}.

%% Extra child specs provided by the callback module. The specs/2 callback is
%% optional: a module that does not export it contributes no extra children.
extra_specs(Module, Name, Args) ->
    %% erlang:function_exported/3 returns false for modules that are not
    %% loaded yet, so attempt to load the module first. The result is
    %% deliberately ignored: if loading fails, function_exported/3 returns
    %% false and no extra children are added.
    _ = code:ensure_loaded(Module),
    case erlang:function_exported(Module, specs, 2) of
        true ->
            Module:specs(Name, Args);
        false ->
            []
    end.
--------------------------------------------------------------------------------
/src/chronicle_snapshot_mgr.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_snapshot_mgr).

-include("chronicle.hrl").

-export([start_link/0]).
-export([get_latest_snapshot/0, release_snapshot/1, store_snapshot/1]).
-export([pending_snapshot/2, cancel_pending_snapshot/1, save_snapshot/3]).
-export([need_snapshot/1, wipe/0]).

-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

-import(chronicle_utils, [sanitize_stacktrace/1]).

-define(SERVER, ?SERVER_NAME(?MODULE)).
-define(TIMEOUT, chronicle_settings:get({snapshot_mgr, timeout}, 15000)).

%% A snapshot that is currently being taken.
-record(pending_snapshot, {
          %% sequence number the snapshot is taken at
          seqno,
          %% map of saver process pid => name of the RSM it saves
          savers,
          %% set of RSMs that have not yet requested a saver
          remaining_rsms }).

-record(state, {
          %% latest stored snapshot: undefined or
          %% {Seqno, HistoryId, Term, Config}
          snapshot,

          %% map of monitor reference => seqno of the snapshot being read
          monitors,
          %% map of seqno => number of readers of that snapshot
          snapshot_readers,

          %% undefined or a #pending_snapshot{}
          pending_snapshot
         }).

start_link() ->
    gen_server:start_link(?START_NAME(?MODULE), ?MODULE, [], []).

%% Ask for the latest stored snapshot on behalf of the calling process. On
%% success the reply contains the monitor reference the server created for
%% the caller.
get_latest_snapshot() ->
    Reader = self(),
    gen_server:call(?SERVER, {get_latest_snapshot, Reader}, ?TIMEOUT).

%% Tell the server that the reader identified by Ref no longer needs its
%% snapshot. Asynchronous.
release_snapshot(Ref) ->
    gen_server:cast(?SERVER, {release_snapshot, Ref}).

%% Record Snapshot as the latest one. Asynchronous.
store_snapshot(Snapshot) ->
    gen_server:cast(?SERVER, {store_snapshot, Snapshot}).

%% Announce that a snapshot at Seqno is expected from the given RSMs.
%% Asynchronous.
pending_snapshot(Seqno, RSMs) ->
    gen_server:cast(?SERVER, {pending_snapshot, Seqno, RSMs}).

%% Cancel the pending snapshot at Seqno. Exits with the server's error when
%% the request is refused.
cancel_pending_snapshot(Seqno) ->
    Request = {cancel_pending_snapshot, Seqno},
    case gen_server:call(?SERVER, Request, ?TIMEOUT) of
        {error, Error} ->
            exit(Error);
        ok ->
            ok
    end.

%% Called by an RSM to hand over its snapshot for Seqno. The server either
%% provides a saver process, to which the snapshot is then sent, or rejects
%% the snapshot.
save_snapshot(Name, Seqno, Snapshot) ->
    Request = {get_snapshot_saver, Name, self(), Seqno},
    case gen_server:call(?SERVER, Request, ?TIMEOUT) of
        {error, rejected} ->
            ?INFO("Snapshot for RSM ~p at "
                  "sequence number ~p got rejected.", [Name, Seqno]);
        {ok, SaverPid} ->
            SaverPid ! {snapshot, Snapshot}
    end.

%% Whether the RSM called Name is still expected to deliver a snapshot.
need_snapshot(Name) ->
    gen_server:call(?SERVER, {need_snapshot, Name}, ?TIMEOUT).

wipe() ->
    ok = gen_server:call(?SERVER, wipe, ?TIMEOUT).

%% callbacks
init([]) ->
    InitialState = #state{snapshot = undefined,
                          monitors = #{},
                          snapshot_readers = #{}},
    {ok, InitialState}.

%% Unknown calls are answered with 'nack'.
handle_call({get_latest_snapshot, ReaderPid}, _From, State) ->
    handle_get_latest_snapshot(ReaderPid, State);
handle_call({get_snapshot_saver, RSM, RSMPid, Seqno}, _From, State) ->
    handle_get_snapshot_saver(RSM, RSMPid, Seqno, State);
handle_call({cancel_pending_snapshot, Seqno}, _From, State) ->
    handle_cancel_pending_snapshot(Seqno, State);
handle_call({need_snapshot, RSM}, _From, State) ->
    Reply = need_snapshot(RSM, State),
    {reply, Reply, State};
handle_call(wipe, _From, State) ->
    handle_wipe(State);
handle_call(_Call, _From, State) ->
    {reply, nack, State}.

%% Unknown casts are logged and otherwise ignored.
handle_cast({store_snapshot, Snapshot}, State) ->
    handle_store_snapshot(Snapshot, State);
handle_cast({release_snapshot, Ref}, State) ->
    handle_release_snapshot(Ref, State);
handle_cast({pending_snapshot, Seqno, RSMs}, State) ->
    handle_pending_snapshot(Seqno, RSMs, State);
handle_cast(Cast, State) ->
    ?WARNING("Unexpected cast: ~w", [Cast]),
    {noreply, State}.

%% Unknown messages are logged and otherwise ignored.
handle_info({snapshot_result, RSM, SaverPid, Result}, State) ->
    handle_snapshot_result(RSM, SaverPid, Result, State);
handle_info({'DOWN', MRef, process, _Pid, _Reason}, State) ->
    handle_down(MRef, State);
handle_info(Msg, State) ->
    ?WARNING("Unexpected message: ~w", [Msg]),
    {noreply, State}.

%% internal

%% Reply with the latest snapshot, if any. The reader is monitored and
%% registered against the snapshot's seqno; the monitor reference is part of
%% the reply.
handle_get_latest_snapshot(_Pid, #state{snapshot = undefined} = State) ->
    {reply, {error, no_snapshot}, State};
handle_get_latest_snapshot(Pid,
                           #state{snapshot = {Seqno, HistoryId,
                                              Term, Config}} = State) ->
    MRef = erlang:monitor(process, Pid),
    NewState = add_reader(MRef, Seqno, State),
    {reply, {ok, MRef, Seqno, HistoryId, Term, Config}, NewState}.

%% Forget the stored snapshot and all of its readers.
handle_wipe(#state{monitors = Monitors,
                   pending_snapshot = Pending} = State) ->
    %% chronicle_agent should cancel snapshots before calling wipe()
    undefined = Pending,

    %% All readers should be stopped by now, but it's possible that DOWN
    %% messages haven't gotten delivered yet.
    _ = [erlang:demonitor(MRef, [flush]) || MRef <- maps:keys(Monitors)],

    WipedState = State#state{snapshot = undefined,
                             monitors = #{},
                             snapshot_readers = #{}},
    {reply, ok, WipedState}.

%% Make Snapshot the latest snapshot. It must be a 4-tuple that differs from
%% the current one. The snapshot being replaced is released if possible.
handle_store_snapshot(Snapshot, #state{snapshot = Previous} = State) ->
    {_Seqno, _HistoryId, _Term, _Config} = Snapshot,
    true = (Snapshot =/= Previous),

    NewState = State#state{snapshot = Snapshot},
    release_previous(Previous, NewState),

    {noreply, NewState}.

release_previous(undefined, _State) ->
    ok;
release_previous({OldSeqno, _, _, _}, State) ->
    maybe_release_snapshot(OldSeqno, State).

%% A reader is done with its snapshot: drop the monitor (flushing a DOWN
%% message that may already be in the mailbox) and unregister the reader.
handle_release_snapshot(MRef, State) ->
    erlang:demonitor(MRef, [flush]),
    NewState = remove_reader(MRef, State),
    {noreply, NewState}.

%% Start tracking a snapshot at Seqno that is expected from the given RSMs.
%% There must be no other pending snapshot.
handle_pending_snapshot(Seqno, RSMs, State) ->
    undefined = State#state.pending_snapshot,
    Pending = #pending_snapshot{seqno = Seqno,
                                savers = #{},
                                remaining_rsms = sets:from_list(RSMs)},

    {noreply, State#state{pending_snapshot = Pending}}.

%% Cancel the pending snapshot if it is the one at Seqno. Succeeds trivially
%% when nothing is pending and fails when a snapshot at a different seqno is
%% pending.
handle_cancel_pending_snapshot(_Seqno,
                               #state{pending_snapshot = undefined} = State) ->
    {reply, ok, State};
handle_cancel_pending_snapshot(
  Seqno,
  #state{pending_snapshot =
             #pending_snapshot{seqno = Seqno, savers = Savers}} = State) ->
    ?DEBUG("Canceling snapshot at seqno ~p", [Seqno]),
    cancel_savers(Savers),

    {reply, ok, State#state{pending_snapshot = undefined}};
handle_cancel_pending_snapshot(
  Seqno,
  #state{pending_snapshot =
             #pending_snapshot{seqno = PendingSeqno}} = State) ->
    {reply, {error, {bad_snapshot, Seqno, PendingSeqno}}, State}.

%% Provide a saver process to an RSM that offers a snapshot at Seqno. The
%% offer is rejected unless the RSM is still expected to deliver a snapshot
%% at exactly that seqno.
handle_get_snapshot_saver(RSM, RSMPid, Seqno, State) ->
    case need_snapshot(RSM, State) of
        {true, Seqno} ->
            Pending = State#state.pending_snapshot,
            {SaverPid, NewPending} =
                spawn_snapshot_saver(RSM, RSMPid, Pending),
            {reply, {ok, SaverPid},
             State#state{pending_snapshot = NewPending}};
        {true, NeededSeqno} ->
            %% It's a bug if an RSM comes to us with a snapshot we're not yet
            %% aware of
            true = (NeededSeqno >= Seqno),
            {reply, {error, rejected}, State};
        false ->
            {reply, {error, rejected}, State}
    end.

%% Spawn a linked process that saves the snapshot of the given RSM, and mark
%% the RSM as being taken care of.
spawn_snapshot_saver(RSM, RSMPid,
                     #pending_snapshot{seqno = Seqno,
                                       savers = Savers,
                                       remaining_rsms = RSMs} = Pending) ->
    Parent = self(),
    SaverPid = proc_lib:spawn_link(
                 fun () ->
                         run_snapshot_saver(Parent, RSM, RSMPid, Seqno)
                 end),

    NewPending =
        Pending#pending_snapshot{savers = Savers#{SaverPid => RSM},
                                 remaining_rsms = sets:del_element(RSM, RSMs)},
    {SaverPid, NewPending}.

%% Body of the saver process: save the snapshot and report the outcome to
%% the parent. Exceptions are logged and reported as 'failed'.
run_snapshot_saver(Parent, RSM, RSMPid, Seqno) ->
    Result =
        try
            snapshot_saver(RSM, RSMPid, Seqno)
        catch
            T:E:Stacktrace ->
                ?ERROR("Exception while taking "
                       "snapshot for RSM ~p~p at seqno ~p: ~p~n"
                       "Stacktrace:~n~p",
                       [RSM, RSMPid, Seqno, {T, E},
                        sanitize_stacktrace(Stacktrace)]),
                failed
        end,

    %% Make sure to change flush_snapshot_results() if the
    %% format of the message is modified.
    Parent ! {snapshot_result, RSM, self(), Result}.

%% Drop the snapshot_result messages that are already in the mailbox.
flush_snapshot_results() ->
    ?FLUSH({snapshot_result, _, _, _}),
    ok.
%% Body of a saver process: wait for the RSM to hand over its snapshot and
%% store it via chronicle_storage, or give up with 'failed' if the RSM dies
%% first.
snapshot_saver(RSM, RSMPid, Seqno) ->
    Monitor = erlang:monitor(process, RSMPid),

    receive
        {'DOWN', Monitor, process, RSMPid, _Reason} ->
            ?ERROR("RSM ~p~p died "
                   "before passing a snapshot for seqno ~p.",
                   [RSM, RSMPid, Seqno]),
            failed;
        {snapshot, Data} ->
            chronicle_storage:save_rsm_snapshot(Seqno, RSM, Data)
    end.

%% Returns {true, Seqno} when a snapshot is pending and RSM has not yet been
%% given a saver for it, and false otherwise.
need_snapshot(_RSM, #state{pending_snapshot = undefined}) ->
    false;
need_snapshot(RSM,
              #state{pending_snapshot =
                         #pending_snapshot{seqno = Seqno,
                                           remaining_rsms = Remaining}}) ->
    sets:is_element(RSM, Remaining) andalso {true, Seqno}.

%% Dispatch on the outcome reported by a saver process.
handle_snapshot_result(RSM, Pid, ok, State) ->
    handle_snapshot_ok(RSM, Pid, State);
handle_snapshot_result(RSM, _Pid, failed, State) ->
    handle_snapshot_failed(RSM, State).

%% One saver finished successfully. Once no savers are left and no RSM is
%% still expected to ask for one, the whole snapshot is reported to
%% chronicle_agent as done.
handle_snapshot_ok(RSM, Pid, #state{pending_snapshot = Pending} = State) ->
    #pending_snapshot{seqno = Seqno,
                      savers = Savers,
                      remaining_rsms = Remaining} = Pending,

    ?DEBUG("Saved snapshot for RSM ~p at seqno ~p", [RSM, Seqno]),

    StillSaving = maps:remove(Pid, Savers),
    AllDone = (maps:size(StillSaving) =:= 0) andalso sets:is_empty(Remaining),
    if
        AllDone ->
            ?DEBUG("All RSM snapshots at seqno ~p saved.", [Seqno]),

            chronicle_agent:snapshot_ok(Seqno),
            {noreply, State#state{pending_snapshot = undefined}};
        true ->
            NewPending = Pending#pending_snapshot{savers = StillSaving},
            {noreply, State#state{pending_snapshot = NewPending}}
    end.
%% A saver failed: abort the whole pending snapshot, stop the other savers
%% and tell chronicle_agent about the failure.
handle_snapshot_failed(RSM, #state{pending_snapshot = Pending} = State) ->
    #pending_snapshot{seqno = Seqno, savers = Savers} = Pending,

    ?ERROR("Aborting snapshot at seqno ~b "
           "because RSM ~p failed to take its snapshot",
           [Seqno, RSM]),

    cancel_savers(Savers),
    chronicle_agent:snapshot_failed(Seqno),
    {noreply, State#state{pending_snapshot = undefined}}.

%% Kill every saver process and drop the results they may already have sent.
cancel_savers(Savers) ->
    KillSaver = fun (SaverPid, _RSM) ->
                        chronicle_utils:terminate_linked_process(SaverPid,
                                                                 kill)
                end,
    chronicle_utils:maps_foreach(KillSaver, Savers),
    flush_snapshot_results().

%% A monitored reader went down; treat it as having released its snapshot.
handle_down(MRef, State) ->
    NewState = remove_reader(MRef, State),
    {noreply, NewState}.

%% Register a reader (identified by its monitor reference) of the snapshot at
%% Seqno and bump that snapshot's reader count.
add_reader(MRef, Seqno, #state{monitors = Monitors,
                               snapshot_readers = Readers} = State) ->
    Count = maps:get(Seqno, Readers, 0),
    State#state{monitors = Monitors#{MRef => Seqno},
                snapshot_readers = Readers#{Seqno => Count + 1}}.

%% Forget the reader behind MRef, if it is known. The reader count of its
%% snapshot is decremented (the entry disappears when it reaches zero) and
%% the snapshot is released if that has become possible.
remove_reader(MRef, #state{monitors = Monitors,
                           snapshot_readers = Readers} = State) ->
    case maps:take(MRef, Monitors) of
        error ->
            State;
        {Seqno, MonitorsLeft} ->
            ReadersLeft = maps:get(Seqno, Readers) - 1,
            NewReaders =
                if
                    ReadersLeft > 0 ->
                        Readers#{Seqno => ReadersLeft};
                    true ->
                        maps:remove(Seqno, Readers)
                end,

            NewState = State#state{monitors = MonitorsLeft,
                                   snapshot_readers = NewReaders},
            maybe_release_snapshot(Seqno, NewState),
            NewState
    end.

%% Ask chronicle_agent to release the snapshot at Seqno, but only when
%% can_release_snapshot/2 says so.
maybe_release_snapshot(Seqno, State) ->
    case can_release_snapshot(Seqno, State) of
        false ->
            ok;
        true ->
            chronicle_agent:release_snapshot(Seqno)
    end.
%% A snapshot can be released when it is no longer the current one and
%% nobody is reading it.
can_release_snapshot(Seqno, State) ->
    case is_stale_snapshot(Seqno, State) of
        true ->
            no_readers(Seqno, State);
        false ->
            false
    end.

%% True when Seqno is not the seqno of the currently stored snapshot.
is_stale_snapshot(Seqno, #state{snapshot = {CurrentSeqno, _, _, _}}) ->
    CurrentSeqno =/= Seqno.

%% True when no reader is registered for the snapshot at Seqno.
no_readers(Seqno, #state{snapshot_readers = Readers}) ->
    not maps:is_key(Seqno, Readers).

--------------------------------------------------------------------------------
/src/chronicle_stats.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_stats).

-include("chronicle.hrl").

-export([report_histo/4, report_counter/2, report_gauge/2, report_max/4]).
-export([ignore_stats/1]).

%% Each report_* function wraps its arguments into a tagged tuple and hands
%% it to report/1.

report_histo(Metric, Max, Unit, Value) ->
    Event = {histo, Metric, Max, Unit, Value},
    report(Event).

report_counter(Metric, By) ->
    Event = {counter, Metric, By},
    report(Event).

report_gauge(Metric, Value) ->
    Event = {gauge, Metric, Value},
    report(Event).

report_max(Metric, Window, Bucket, Value) ->
    Event = {max, Metric, Window, Bucket, Value},
    report(Event).

%% Pass the event to the fun stored in persistent_term under
%% ?CHRONICLE_STATS. Any exception (including a missing term) is logged and
%% otherwise ignored, so reporting stats never crashes the caller.
report(Event) ->
    try
        Reporter = persistent_term:get(?CHRONICLE_STATS),
        Reporter(Event)
    catch
        Class:Reason ->
            ?ERROR("Failed to report stats ~w: ~w", [Event, {Class, Reason}])
    end.

%% A stats sink that discards every event.
ignore_stats(_Event) ->
    ok.

--------------------------------------------------------------------------------
/src/chronicle_status.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2021 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_status).

-include("chronicle.hrl").

-behavior(gen_server).

-export([start_link/0]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

-export([get_cluster_status/0, get_peers/0]).

-export_type([cluster_status/0, peer_statuses/0]).

%% Name of this server locally and on a given peer.
-define(SERVER, ?SERVER_NAME(?MODULE)).
-define(SERVER(Peer), ?SERVER_NAME(Peer, ?MODULE)).

%% Both intervals are read from chronicle_settings; the second argument is
%% the default.
-define(PING_INTERVAL,
        chronicle_settings:get({status, ping_interval}, 3000)).
-define(WAIT_MORE_STATUS_TIMEOUT,
        chronicle_settings:get({status, wait_more_status_timeout}, 100)).

%% local_status          - status computed from the local agent metadata
%% cluster_status        - status aggregated over the local and peer statuses
%% last_heard            - peer => timestamp of the last message from it
%% statuses              - peer => last status received from it
%% wait_more_status_tref - timer of a delayed status broadcast, if scheduled
-record(state, { local_status,
                 cluster_status,

                 last_heard,
                 statuses,

                 wait_more_status_tref }).

start_link() ->
    Name = ?START_NAME(?MODULE),
    gen_server:start_link(Name, ?MODULE, [], []).

-type cluster_status() :: #{histories => cluster_status_histories(),
                            failovers => cluster_status_failovers()}.
-type cluster_status_histories() ::
        #{chronicle:history_id() => [chronicle:peer()]}.
-type cluster_status_failovers() ::
        #{FailoverId::chronicle:history_id() => cluster_status_failover()}.
-type cluster_status_failover() ::
        #{old_history_id := chronicle:history_id(),
          new_history_id := chronicle:history_id(),
          peers := [chronicle:peer()],
          status :=
              #{chronicle:peer() =>
                    pending | started | done | diverged | conflict | unknown}}.

%% Fetch the aggregated cluster status from the local status server.
-spec get_cluster_status() -> cluster_status().
get_cluster_status() ->
    gen_server:call(?SERVER, get_cluster_status).

-type peer_status() :: #{since_heard => Millis::non_neg_integer()}.
-type peer_statuses() :: #{chronicle:peer() => peer_status()}.

%% For every peer we have heard from (and for ourselves), how long ago that
%% was.
-spec get_peers() -> peer_statuses().
get_peers() ->
    gen_server:call(?SERVER, get_peers).

%% callbacks

%% Subscribe to peer up/down notifications and to chronicle events, compute
%% the initial status, exchange statuses with the live peers and start the
%% periodic ping.
init([]) ->
    chronicle_peers:monitor(),
    request_status_all(),

    Server = self(),
    chronicle_events:subscribe(
      fun (Event) ->
              case is_interesting_event(Event) of
                  false ->
                      ok;
                  true ->
                      Server ! refresh_status
              end
      end),

    Initial = #state{local_status = local_status(),
                     last_heard = #{},
                     statuses = #{},
                     wait_more_status_tref = undefined},
    State = recompute_cluster_status(Initial),
    announce_cluster_status_changed(),

    send_status_all(State),
    schedule_ping(),

    {ok, State}.

handle_call(get_cluster_status, _From,
            #state{cluster_status = ClusterStatus} = State) ->
    {reply, ClusterStatus, State};
handle_call(get_peers, _From, #state{last_heard = LastHeard} = State) ->
    Now = get_timestamp(),
    %% Timestamps are in native units; the reply is in milliseconds. The
    %% local peer is always included, with a zero delay.
    ToPeerStatus =
        fun (_Peer, HeardAt) ->
                Elapsed = erlang:convert_time_unit(Now - HeardAt,
                                                   native, millisecond),
                #{since_heard => Elapsed}
        end,
    Reply = maps:map(ToPeerStatus, LastHeard#{?PEER() => Now}),
    {reply, Reply, State};
handle_call(_Call, _From, State) ->
    {reply, nack, State}.

handle_cast(Cast, State) ->
    ?WARNING("Unexpected cast:~n~p", [Cast]),
    {noreply, State}.

%% Identical queued copies of refresh_status, request_status and ping
%% messages are flushed before handling, so a burst is handled once.
handle_info(refresh_status = Msg, State) ->
    ?FLUSH(Msg),
    handle_refresh_status(State);
handle_info(send_status, State) ->
    handle_send_status(State);
handle_info({request_status, Peer} = Msg, State) ->
    ?FLUSH(Msg),
    handle_request_status(Peer, State);
handle_info(send_ping, State) ->
    handle_send_ping(State);
handle_info({ping, Peer} = Msg, State) ->
    ?FLUSH(Msg),
    handle_ping(Peer, State);
handle_info({status, Peer, Status}, State) ->
    %% Several statuses from the same peer may be queued and they need not be
    %% equal. Flushing them and then handling the one received first would
    %% keep the oldest status and discard the newer ones, so drain the queue
    %% and keep the most recent one instead.
    handle_status(Peer, latest_status(Peer, Status), State);
handle_info({nodeup, Node, []}, State) ->
    handle_nodeup(Node, State);
handle_info({nodedown, Node, _}, State) ->
    handle_nodedown(Node, State);
handle_info(Msg, State) ->
    ?WARNING("Unexpected message:~n~p", [Msg]),
    {noreply, State}.

%% internal

%% Consume all {status, Peer, _} messages currently in the mailbox and return
%% the status carried by the last of them, or Status if there are none.
latest_status(Peer, Status) ->
    receive
        {status, Peer, NewerStatus} ->
            latest_status(Peer, NewerStatus)
    after
        0 ->
            Status
    end.

%% Only history and config changes can affect the local status.
is_interesting_event(Event) ->
    case Event of
        {new_history, _, _} ->
            true;
        {new_config, _, _} ->
            true;
        _ ->
            false
    end.

%% Recompute the local status; if it changed, update the cluster status and
%% schedule a (delayed) broadcast of the new status to the peers.
handle_refresh_status(#state{local_status = OldStatus} = State) ->
    NewStatus = local_status(),
    case NewStatus =:= OldStatus of
        true ->
            {noreply, State};
        false ->
            NewState = recompute_cluster_status(
                         State#state{local_status = NewStatus}),
            {noreply, maybe_schedule_send_status(NewState)}
    end.

%% The delayed broadcast timer fired: send the status and clear the timer.
handle_send_status(State) ->
    send_status_all(State),
    {noreply, State#state{wait_more_status_tref = undefined}}.

%% A peer asked for our status.
handle_request_status(Peer, State) ->
    send_status(Peer, State),
    {noreply, State}.

%% Periodic ping: re-arm the timer, then ping the live peers.
handle_send_ping(State) ->
    schedule_ping(),
    send_ping(),
    {noreply, State}.

schedule_ping() ->
    erlang:send_after(?PING_INTERVAL, self(), send_ping).

%% A ping only refreshes the time we last heard from the peer.
handle_ping(Peer, #state{last_heard = LastHeard} = State) ->
    HeardAt = get_timestamp(),
    {noreply, State#state{last_heard = LastHeard#{Peer => HeardAt}}}.

%% A status message refreshes the last-heard time and, when the status is new
%% or differs from the one stored for the peer, triggers a recomputation of
%% the cluster status.
handle_status(Peer, Status, #state{last_heard = LastHeard,
                                   statuses = Statuses} = State) ->
    HeardAt = get_timestamp(),
    Heard = State#state{last_heard = LastHeard#{Peer => HeardAt}},

    case maps:find(Peer, Statuses) of
        {ok, Status} ->
            %% Same status as before, nothing to recompute.
            {noreply, Heard};
        _ ->
            Changed = Heard#state{statuses = Statuses#{Peer => Status}},
            {noreply, recompute_cluster_status(Changed)}
    end.

handle_nodeup(Peer, State) ->
    %% Try not to request a status from ourselves. This is not 100%
    %% bullet-proof if there's a burst of renames. But everything should
    %% converge to a stable state anyway.
    case ?PEER() of
        Peer ->
            ok;
        _ ->
            request_status(Peer)
    end,
    {noreply, State}.

%% Forget everything known about a peer that went down and recompute the
%% cluster status without it.
handle_nodedown(Peer, #state{last_heard = LastHeard,
                             statuses = Statuses} = State) ->
    Pruned = State#state{last_heard = maps:remove(Peer, LastHeard),
                         statuses = maps:remove(Peer, Statuses)},

    {noreply, recompute_cluster_status(Pruned)}.

%% Monotonic time in native units.
get_timestamp() ->
    erlang:monotonic_time().

%% Build the local status from the agent's metadata: the current history id
%% and a description of the pending branch, if any.
local_status() ->
    #metadata{pending_branch = Branch,
              history_id = HistoryId} = chronicle_agent:get_metadata(),

    #{history_id => HistoryId, branch => branch_status(Branch)}.

%% Describe a pending branch as a map, or no_branch when there is none.
branch_status(undefined) ->
    no_branch;
branch_status(#branch{history_id = NewId,
                      old_history_id = OldId,
                      peers = BranchPeers}) ->
    #{peers => BranchPeers,
      new_history_id => NewId,
      old_history_id => OldId}.

request_status_all() ->
    request_status(live_peers()).

%% Ask the given peer(s) to send us their status.
request_status(Peers) ->
    Request = {request_status, ?PEER()},
    send_to(Peers, Request).

send_ping() ->
    Ping = {ping, ?PEER()},
    send_all(Ping).

%% Arm the delayed status broadcast unless one is already scheduled.
maybe_schedule_send_status(#state{wait_more_status_tref = undefined} = State) ->
    TRef = erlang:send_after(?WAIT_MORE_STATUS_TIMEOUT,
                             self(), send_status),
    State#state{wait_more_status_tref = TRef};
maybe_schedule_send_status(#state{} = State) ->
    State.

send_status_all(State) ->
    send_status(live_peers(), State).

%% Send our local status to the given peer(s).
send_status(Peers, #state{local_status = LocalStatus}) ->
    Msg = {status, ?PEER(), LocalStatus},
    send_to(Peers, Msg).

send_all(Msg) ->
    send_to(live_peers(), Msg).

%% Send Msg to the status server on a single peer or on each peer of a list.
%% The sends use the nosuspend and noconnect options.
send_to(Peer, Msg) when is_atom(Peer) ->
    send_to([Peer], Msg);
send_to(Peers, Msg) when is_list(Peers) ->
    SendOne = fun (Peer) ->
                      chronicle_utils:send(?SERVER(Peer), Msg,
                                           [nosuspend, noconnect])
              end,
    lists:foreach(SendOne, Peers).

live_peers() ->
    chronicle_peers:get_live_peers_other().

%% Recompute the cluster status from the local status plus the statuses
%% received from peers. If the result differs from the stored one, announce
%% the change and store the new value.
recompute_cluster_status(#state{local_status = LocalStatus,
                                cluster_status = Current,
                                statuses = PeerStatuses} = State) ->
    Everyone = PeerStatuses#{?PEER() => LocalStatus},

    case cluster_status(Everyone) of
        Current ->
            State;
        Recomputed ->
            announce_cluster_status_changed(),
            State#state{cluster_status = Recomputed}
    end.

announce_cluster_status_changed() ->
    chronicle_utils:announce_important_change(cluster_status).

%% Aggregate per-peer statuses into the cluster status map.
cluster_status(Statuses) ->
    #{histories => aggregate_histories(Statuses),
      failovers => aggregate_failovers(Statuses)}.

%% Collect the failovers (pending branches) reported by the peers, keyed by
%% the new history id. Only the first branch encountered for a given new
%% history id is kept; it is annotated with the per-peer failover status.
aggregate_failovers(Statuses) ->
    Collect =
        fun (_Peer, PeerStatus, Failovers) ->
                case maps:find(branch, PeerStatus) of
                    {ok, #{old_history_id := OldId,
                           new_history_id := NewId,
                           peers := Peers} = Branch} ->
                        case Failovers of
                            #{NewId := _} ->
                                Failovers;
                            _ ->
                                Progress = failover_status(Peers, OldId,
                                                           NewId, Statuses),
                                Failovers#{NewId =>
                                               Branch#{status => Progress}}
                        end;
                    _ ->
                        Failovers
                end
        end,
    maps:fold(Collect, #{}, Statuses).

%% Map every peer of the branch to its failover status.
failover_status(Peers, OldHistoryId, NewHistoryId, PeerStatuses) ->
    maps:from_list(
      [{Peer, failover_peer_status(Peer, OldHistoryId,
                                   NewHistoryId, PeerStatuses)} ||
          Peer <- lists:sort(Peers)]).

%% Status of a single peer with respect to a failover from OldHistoryId to
%% NewHistoryId:
%%   done     - no pending branch, already on the new history
%%   pending  - no pending branch, still on the old history
%%   diverged - no pending branch, on some other history
%%   started  - has a pending branch to the same new history
%%   conflict - has a pending branch to a different history
%%   unknown  - no (usable) status is known for the peer
failover_peer_status(Peer, OldHistoryId, NewHistoryId, PeerStatuses) ->
    case maps:find(Peer, PeerStatuses) of
        {ok, #{history_id := HistoryId, branch := no_branch}} ->
            if
                HistoryId =:= NewHistoryId ->
                    done;
                HistoryId =:= OldHistoryId ->
                    pending;
                true ->
                    diverged
            end;
        {ok, #{history_id := _, branch := #{new_history_id := NewHistoryId}}} ->
            started;
        {ok, #{history_id := _, branch := #{new_history_id := _}}} ->
            conflict;
        _ ->
            unknown
    end.

%% Group the peers by the history id they report; each group is sorted.
aggregate_histories(Statuses) ->
    Grouped = maps:fold(fun update_histories/3, #{}, Statuses),
    maps:map(fun (_HistoryId, Peers) -> lists:sort(Peers) end, Grouped).

%% Add Peer to the group of its history id. Statuses without a history_id
%% key are skipped.
update_histories(Peer, PeerStatus, Acc) ->
    case maps:find(history_id, PeerStatus) of
        error ->
            Acc;
        {ok, HistoryId} ->
            Known = maps:get(HistoryId, Acc, []),
            Acc#{HistoryId => [Peer | Known]}
    end.

--------------------------------------------------------------------------------
/src/chronicle_sup.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_sup).

-behavior(supervisor).

-include("chronicle.hrl").

-export([start_link/0]).
-export([init/1]).

start_link() ->
    Name = ?START_NAME(?MODULE),
    supervisor:start_link(Name, ?MODULE, []).

%% callbacks

%% rest_for_one supervisor allowing at most 3 restarts within 10 seconds.
init([]) ->
    %% TODO: reconsider the strategy
    SupFlags = #{period => 10,
                 intensity => 3,
                 strategy => rest_for_one},
    {ok, {SupFlags, child_specs()}}.

%% Child specifications in start order: the infrastructure workers come
%% first, followed by the agent and the secondary supervisors.
%%
%% TODO: revise shutdown specifications
child_specs() ->
    Worker = fun (Id, Start, Shutdown) ->
                     #{id => Id,
                       start => Start,
                       restart => permanent,
                       shutdown => Shutdown,
                       type => worker}
             end,
    Supervisor = fun (Module) ->
                         #{id => Module,
                           start => {Module, start_link, []},
                           restart => permanent,
                           type => supervisor}
                 end,

    [Worker(chronicle_peers,
            {chronicle_peers, start_link, []}, brutal_kill),
     Worker(chronicle_events,
            {chronicle_events, start_link, []}, 1000),
     Worker(chronicle_external_events,
            {gen_event, start_link, [?START_NAME(?EXTERNAL_EVENTS)]},
            brutal_kill),
     Worker(chronicle_ets,
            {chronicle_ets, start_link, []}, brutal_kill),
     Worker(chronicle_settings,
            {chronicle_settings, start_link, []}, brutal_kill),
     Supervisor(chronicle_agent_sup),
     Supervisor(chronicle_secondary_sup)].

--------------------------------------------------------------------------------
/src/dynamic_supervisor.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(dynamic_supervisor).

-include("chronicle.hrl").

%% This module implements both supervisor and gen_server behaviors. But if I
%% uncomment the following line, this causes a "callback conflict" warning to
%% be emitted.
%%
%% -behavior(supervisor).
-behavior(gen_server).

-export([start_link/2, start_link/3, sync/2, send_event/2]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).

%% supervisor  - pid of the linked supervisor that owns the children
%% mod         - callback module
%% mod_state   - current state of the callback module
%% child_specs - child specs currently installed in the supervisor
-record(state, {supervisor :: undefined | pid(),
                mod :: module(),
                mod_state :: term(),
                child_specs :: list()}).

%% Callback module interface: init/1 supplies the supervisor flags and the
%% initial state, child_specs/1 maps a state to the children that should be
%% running, handle_event/2 updates the state in response to an event.
-callback init(Args :: term()) ->
    {ok, SupFlags :: supervisor:sup_flags(), State :: term()} |
    {stop, Reason :: term()} |
    ignore.
-callback child_specs(State :: term()) ->
    [ChildSpec :: supervisor:child_spec()].
-callback handle_event(Event :: term(), State :: term()) ->
    {noreply, NewState :: term()} |
    {stop, Reason :: term()}.

%% Start an unregistered dynamic supervisor driven by callback module Mod.
start_link(Mod, Args) ->
    InitArg = {gen_server, Mod, Args},
    gen_server:start_link(?MODULE, InitArg, []).

%% Same as start_link/2, but registered under Name.
start_link(Name, Mod, Args) ->
    InitArg = {gen_server, Mod, Args},
    gen_server:start_link(Name, ?MODULE, InitArg, []).

%% Synchronous no-op call; returns once the server has handled it.
sync(ServerRef, Timeout) ->
    gen_server:call(ServerRef, sync, Timeout).

%% Asynchronously deliver Event to the callback module's handle_event/2.
send_event(ServerRef, Event) ->
    gen_server:cast(ServerRef, {event, Event}).

%% callbacks

%% init/1 serves two roles: as the supervisor callback it starts with no
%% children; as the gen_server callback it initializes the callback module,
%% starts the linked supervisor and installs the initial children.
init({supervisor, Flags}) ->
    {ok, {Flags, []}};
init({gen_server, Module, Args}) ->
    process_flag(trap_exit, true),
    case Module:init(Args) of
        ignore ->
            ignore;
        {stop, _} = Stop ->
            Stop;
        {ok, Flags, ModState} ->
            %% There's no simple way to enforce the order of children that are
            %% started dynamically. So only one_for_all and one_for_one
            %% restart strategies are supported, where the order doesn't
            %% matter.
            true = lists:member(maps:get(strategy, Flags),
                                [one_for_all, one_for_one]),

            case supervisor:start_link(?MODULE, {supervisor, Flags}) of
                {ok, SupPid} ->
                    Initial = #state{supervisor = SupPid,
                                     mod = Module,
                                     mod_state = ModState,
                                     child_specs = []},
                    {ok, manage_children(Initial)};
                Failure ->
                    {stop, {failed_to_start_sup, Failure}}
            end
    end.

handle_call(sync, _From, State) ->
    {reply, ok, State}.

%% Let the callback module process the event; a changed module state may
%% change the set of children.
handle_cast({event, Event}, #state{mod = Module,
                                   mod_state = ModState} = State) ->
    case Module:handle_event(Event, ModState) of
        {stop, Reason} ->
            {stop, Reason, State};
        {noreply, NewModState} ->
            {noreply, handle_mod_state(NewModState, State)}
    end.

%% Any exit signal stops the server. If it came from our supervisor, the pid
%% is cleared first so terminate/2 does not try to stop it again.
handle_info({'EXIT', SupPid, Reason}, #state{supervisor = SupPid} = State) ->
    {stop, Reason, State#state{supervisor = undefined}};
handle_info({'EXIT', _Pid, Reason}, State) ->
    {stop, Reason, State};
handle_info(Msg, State) ->
    ?WARNING("Ignored an unexpected message: ~p", [Msg]),
    {noreply, State}.

%% Take the supervisor down with us, if it is still around.
terminate(Reason, #state{supervisor = SupPid}) when is_pid(SupPid) ->
    chronicle_utils:terminate_linked_process(SupPid, Reason);
terminate(_Reason, #state{}) ->
    ok.

%% internal

%% Re-evaluate the children only when the module state actually changed.
handle_mod_state(ModState, #state{mod_state = ModState} = State) ->
    State;
handle_mod_state(NewModState, State) ->
    manage_children(State#state{mod_state = NewModState}).

%% Bring the supervisor's children in line with what the callback module
%% wants: specs that disappeared are stopped, new specs are started.
manage_children(#state{supervisor = SupPid,
                       mod = Module,
                       mod_state = ModState,
                       child_specs = Current} = State) ->
    Wanted = Module:child_specs(ModState),

    ToStop = Current -- Wanted,
    ToStart = Wanted -- Current,

    stop_children(SupPid, ToStop),
    start_children(SupPid, ToStart),

    State#state{child_specs = Wanted}.

stop_children(SupPid, Specs) ->
    lists:foreach(fun (Spec) -> stop_child(SupPid, Spec) end, Specs).

%% Terminate and delete one child; exits if that fails.
stop_child(SupPid, #{id := Id}) ->
    case stop_child_loop(SupPid, Id, 5) of
        {error, Error} ->
            exit({failed_to_stop_child, Id, Error});
        ok ->
            ok
    end.

%% Terminate the child and then delete it, retrying up to RetriesLeft more
%% times if the child is found running or restarting at deletion time.
stop_child_loop(SupPid, Id, RetriesLeft) ->
    case supervisor:terminate_child(SupPid, Id) of
        {error, _} = TerminateError ->
            TerminateError;
        ok ->
            case supervisor:delete_child(SupPid, Id) of
                ok ->
                    ok;
                {error, Busy}
                  when Busy =:= running;
                       Busy =:= restarting ->
                    %% There's no way to terminate and delete a child
                    %% atomically. Depending on the restart strategy, the
                    %% child that we just terminated above might be running
                    %% again by the time we're trying to delete it. To work
                    %% around this, we retry a couple of times.
                    if
                        RetriesLeft =:= 0 ->
                            {error, exceeded_retries};
                        true ->
                            stop_child_loop(SupPid, Id, RetriesLeft - 1)
                    end;
                {error, _} = DeleteError ->
                    DeleteError
            end
    end.
178 | 179 | start_children(Pid, Specs) -> 180 | lists:foreach( 181 | fun (Spec) -> 182 | case supervisor:start_child(Pid, Spec) of 183 | {ok, _} -> 184 | ok; 185 | {error, Error} -> 186 | exit({failed_to_start_child, Spec, Error}) 187 | end 188 | end, Specs). 189 | -------------------------------------------------------------------------------- /start_cluster: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import atexit 4 | import glob 5 | import os 6 | from os import path 7 | import subprocess 8 | import time 9 | 10 | ROOT_DIR = path.dirname(__file__) 11 | DEFAULT_DATA_DIR = path.join(ROOT_DIR, 'cluster') 12 | DEFAULT_PROFILE = 'default' 13 | DEFAULT_HOSTNAME = 'localhost.localdomain' 14 | 15 | PROFILE_DEFAULT = 'default' 16 | PROFILE_EXAMPLES = 'examples' 17 | 18 | DEFAULT_APP = 'chronicle' 19 | APPS = {DEFAULT_APP: PROFILE_DEFAULT} 20 | 21 | for example_app in glob.glob(path.join(ROOT_DIR, "examples/*")): 22 | APPS[path.basename(example_app)] = PROFILE_EXAMPLES 23 | 24 | def mkdir(path): 25 | os.makedirs(path, exist_ok=True) 26 | 27 | def get_ebin_path(profile): 28 | return glob.glob(path.join(ROOT_DIR, 29 | "_build/{0}/lib/*/ebin").format(profile)) 30 | 31 | def start_node(i, hostname, ebin_path, data_dir, app): 32 | name='chronicle_{0}@{1}'.format(i, hostname) 33 | node_dir = path.join(data_dir, name) 34 | mkdir(node_dir) 35 | 36 | node_data_dir = path.join(node_dir, 'data') 37 | mkdir(node_data_dir) 38 | 39 | log_path = path.join(node_dir, 'log') 40 | log_file = open(log_path, 'a') 41 | 42 | script = ''' 43 | {{ok, _}} = application:ensure_all_started({}, permanent) 44 | '''.format(app) 45 | args = ['erl', '-pa'] + ebin_path + \ 46 | ['+Bd', 47 | '+sbwt', 'none', 48 | '-name', name, 49 | '-noinput', 50 | '-kernel', 'logger_level', 'debug', 51 | '-kernel', 'error_logger_format_depth', '40', 52 | '-chronicle', 'data_dir', '"{}"'.format(node_data_dir), 53 | '-{}'.format(app), 
'instance', '{}'.format(i), 54 | '-eval', script] 55 | process = subprocess.Popen(args, stdin=None, 56 | stdout=log_file, stderr=log_file) 57 | atexit.register(lambda: kill_node(process)) 58 | 59 | return process 60 | 61 | def kill_node(process): 62 | try: 63 | process.kill() 64 | except OSError: 65 | pass 66 | 67 | def start_cluster(args): 68 | app = args.app 69 | ebin_path = get_ebin_path(APPS[app]) 70 | hostname = args.hostname 71 | data_dir = args.data_dir 72 | mkdir(data_dir) 73 | 74 | nodes = [] 75 | for i in range(args.start_index, args.start_index + args.num_nodes): 76 | nodes.append(start_node(i, hostname, ebin_path, data_dir, app)) 77 | 78 | return nodes 79 | 80 | def poll_processes(processes): 81 | while True: 82 | for p in processes: 83 | if p.poll() is not None: 84 | return 85 | 86 | time.sleep(0.1) 87 | 88 | def main(): 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument('--num-nodes', type=int, 91 | dest='num_nodes', required=True) 92 | parser.add_argument('--start-index', type=int, 93 | dest='start_index', default=0) 94 | parser.add_argument('--data-dir', dest='data_dir', default=DEFAULT_DATA_DIR) 95 | parser.add_argument('--app', dest='app', 96 | default=DEFAULT_APP, choices=APPS.keys()) 97 | parser.add_argument('--hostname', dest='hostname', default=DEFAULT_HOSTNAME) 98 | args = parser.parse_args() 99 | 100 | nodes = start_cluster(args) 101 | poll_processes(nodes) 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /test/chronicle_log_tests.erl: -------------------------------------------------------------------------------- 1 | %% @author Couchbase 2 | %% @copyright 2021 Couchbase, Inc. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_log_tests).

-include_lib("eunit/include/eunit.hrl").

%% Scratch directory for the test, under the current working directory.
get_dir() ->
    {ok, CurDir} = file:get_cwd(),
    filename:join([CurDir, "tmp", "log_tests"]).

%% Start from an empty scratch directory.
prepare_dir() ->
    Dir = get_dir(),
    ok = chronicle_utils:delete_recursive(Dir),
    ok = chronicle_utils:mkdir_p(Dir).

prepare() ->
    prepare_dir(),
    chronicle_env:setup_logger().

simple_test_() ->
    {spawn, {setup, fun prepare/0, fun simple_test__/0}}.

%% Write a small log, then check that
%%  - it reads back intact,
%%  - a copy with one modified byte is reported as corrupt,
%%  - a truncated copy is reported as ending unexpectedly and, after being
%%    reopened, yields a prefix of the original items.
simple_test__() ->
    Dir = get_dir(),
    Path1 = filename:join([Dir, "log1"]),
    Path2 = filename:join([Dir, "log2"]),
    {ok, Log} = chronicle_log:create(Path1, user_data),
    {ok, _} = chronicle_log:append(Log, [1, 2]),
    {ok, _} = chronicle_log:append(Log, [3, 4, 5]),
    ok = chronicle_log:close(Log),

    ?assertEqual({ok, [user_data, 1, 2, 3, 4, 5]}, read_log(Path1)),

    {ok, _} = file:copy(Path1, Path2),
    {ok, Fd1} = file:open(Path1, [raw, binary, read, write]),

    %% Corrupt the first log by changing the byte in the middle of the file.
    {ok, Pos} = file:position(Fd1, eof),
    {ok, <<Byte>>} = file:pread(Fd1, Pos div 2, 1),
    ok = file:pwrite(Fd1, Pos div 2, <<(Byte + 1)>>),
    ok = file:close(Fd1),

    ?assertMatch({error, {corrupt_log, _}}, read_log(Path1)),

    %% Cut the copy off at three quarters of its length.
    {ok, Fd2} = file:open(Path2, [raw, binary, read, write]),
    {ok, _} = file:position(Fd2, 3 * (Pos div 4)),
    ok = file:truncate(Fd2),
    ok = file:close(Fd2),

    ?assertMatch({error, {unexpected_eof, _}}, read_log(Path2)),

    ok = repair_log(Path2),
    {ok, Items} = read_log(Path2),
    ?assert(lists:prefix(Items, [user_data, 1, 2, 3, 4, 5])),

    ok.

%% Read a log into a list, in the order chronicle_log:read_log/4 delivers the
%% items. Errors are returned unchanged.
read_log(Path) ->
    Append = fun (Item, Acc) -> [Item | Acc] end,
    case chronicle_log:read_log(Path, Append, Append, []) of
        {ok, Items} ->
            {ok, lists:reverse(Items)};
        Other ->
            Other
    end.

%% Open the log with handlers that ignore its contents and close it again.
%% The test relies on this to make a truncated log readable.
repair_log(Path) ->
    {ok, Log, state} = chronicle_log:open(Path,
                                          fun (_, S) -> S end,
                                          fun (_, S) -> S end,
                                          state),
    ok = chronicle_log:close(Log).

--------------------------------------------------------------------------------
/test/chronicle_peers_vnet.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(chronicle_peers_vnet).

-include("chronicle.hrl").

-behavior(gen_server).

-export([start_link/0]).
-export([get_live_peers/0, monitor/0]).

-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

%% clients - pids subscribed to nodeup/nodedown notifications
%% vnodes  - sorted list of vnodes seen as started at the last refresh
-record(state, { clients, vnodes }).

start_link() ->
    gen_server:start_link(?START_NAME(?MODULE), ?MODULE, [], []).

%% Refresh and return the list of started vnodes.
get_live_peers() ->
    gen_server:call(?SERVER_NAME(?MODULE), get_live_peers).

%% Subscribe the calling process to nodeup/nodedown messages.
monitor() ->
    gen_server:call(?SERVER_NAME(?MODULE), {monitor, self()}).
%% gen_server callbacks

%% Arrange for a `refresh' message every second and start out with no
%% clients and whatever vnodes are started right now.
init([]) ->
    {ok, _TRef} = timer:send_interval(1000, refresh),
    InitialState = #state{clients = [], vnodes = get_vnodes()},
    {ok, InitialState}.

%% get_live_peers: refresh first, then reply with the up-to-date vnode list.
%% {monitor, Pid}: monitor Pid and remember it as a client.
handle_call(get_live_peers, _From, State0) ->
    State = refresh(State0),
    {reply, State#state.vnodes, State};
handle_call({monitor, ClientPid}, _From, #state{clients = Known} = State) ->
    erlang:monitor(process, ClientPid),
    {reply, ok, State#state{clients = [ClientPid | Known]}}.

%% No casts are part of the protocol; stop on any of them.
handle_cast(Unexpected, State) ->
    {stop, {unexpected_cast, Unexpected}, State}.

%% A monitored client went away: forget it.
%% Periodic tick: re-scan the vnodes and notify clients about changes.
handle_info({'DOWN', _MRef, process, ClientPid, _Reason},
            #state{clients = Known} = State) ->
    {noreply, State#state{clients = lists:delete(ClientPid, Known)}};
handle_info(refresh, State) ->
    {noreply, refresh(State)}.

%% internal

%% Re-read the set of started vnodes, tell every client which ones
%% disappeared and which ones appeared, and store the new set.
refresh(#state{clients = Clients, vnodes = Before} = State) ->
    Now = get_vnodes(),

    Appeared = Now -- Before,
    Vanished = Before -- Now,

    lists:foreach(
      fun (Client) ->
              notify_client(Client, Vanished, Appeared)
      end, Clients),

    State#state{vnodes = Now}.

%% Send one client its nodedown messages first, then its nodeup messages.
notify_client(Client, Vanished, Appeared) ->
    lists:foreach(
      fun (VNode) ->
              Reason = nodedown_reason(VNode),
              Client ! {nodedown, VNode, [{nodedown_reason, Reason}]}
      end, Vanished),
    lists:foreach(
      fun (VNode) ->
              Client ! {nodeup, VNode, []}
      end, Appeared).

%% The reason reported depends on whether the vanished vnode is the one
%% returned by vnet:vnode/0.
nodedown_reason(VNode) ->
    case VNode =:= vnet:vnode() of
        false ->
            net_tick_timeout;
        true ->
            net_kernel_terminated
    end.

%% Sorted ids of all `vnet_node' children of the `vnet' supervisor that are
%% currently started.
get_vnodes() ->
    Started = lists:filtermap(fun started_vnode/1,
                              supervisor:which_children(vnet)),
    lists:sort(Started).

%% filtermap helper: keep the id of a started vnet_node child, drop
%% vnet_conn children and stopped nodes.
started_vnode({Id, _Pid, worker, [Type]}) ->
    case Type of
        vnet_node ->
            is_vnode_started(Id) andalso {true, Id};
        vnet_conn ->
            false
    end.

%% Inspect the state of the vnode's process to tell started from stopped.
is_vnode_started(VNode) ->
    NodeProc = vnet:node_proc(VNode),
    case sys:get_state(NodeProc) of
        {stopped, _} ->
            false;
        {started, _} ->
            true
    end.
--------------------------------------------------------------------------------
/test/dynamic_supervisor_tests.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(dynamic_supervisor_tests).

-include_lib("eunit/include/eunit.hrl").

-compile(nowarn_export_all).
-compile(export_all).

%% Start the supervisor under test; without an argument it begins in the
%% `stopped' state, i.e. with no children.
start_link() ->
    start_link(stopped).

start_link(InitialState) ->
    dynamic_supervisor:start_link(?MODULE, InitialState).

%% dynamic_supervisor callback: restart flags plus the initial state.
init(InitialState) ->
    {ok,
     #{strategy => one_for_one,
       intensity => 1,
       period => 10},
     InitialState}.

%% dynamic_supervisor callback: a {state, S} event replaces the state.
handle_event({state, NewState}, _OldState) ->
    {noreply, NewState}.

%% dynamic_supervisor callback: children `a' and `b' exist only while the
%% state is `started'.
child_specs(started) ->
    [child_spec(Name) || Name <- [a, b]];
child_specs(stopped) ->
    [].

child_spec(Name) ->
    #{start => {?MODULE, start_child, [Name]},
      id => Name}.

%% Start a linked child that registers itself as Name, acknowledges the
%% start and then terminates normally on the first message it receives.
start_child(Name) ->
    ChildBody =
        fun () ->
                register(Name, self()),
                proc_lib:init_ack({ok, self()}),
                receive
                    _Any -> ok
                end
        end,
    proc_lib:start_link(erlang, apply, [ChildBody, []]).
%% Walk the supervisor through stopped -> started -> stopped -> started and
%% check after each step (and after shutting it down) whether the children
%% `a' and `b' are registered.
basic_test_() ->
    {spawn,
     fun () ->
             process_flag(trap_exit, true),

             {ok, Sup} = start_link(),
             SwitchTo =
                 fun (NewState) ->
                         dynamic_supervisor:send_event(Sup, {state, NewState}),
                         dynamic_supervisor:sync(Sup, 10000)
                 end,
             assert_children_down(),

             SwitchTo(started),
             assert_children_up(),

             SwitchTo(stopped),
             assert_children_down(),

             SwitchTo(started),
             chronicle_utils:terminate_linked_process(Sup, shutdown),
             assert_children_down()
     end}.

%% A supervisor started in the `started' state has its children running as
%% soon as start_link/1 returns.
sync_start_test_() ->
    {spawn,
     fun () ->
             process_flag(trap_exit, true),
             {ok, Sup} = start_link(started),
             assert_children_up(),

             chronicle_utils:terminate_linked_process(Sup, shutdown)
     end}.

%% Killing one child must not take the supervisor down; killing a second one
%% right after must (restart intensity is 1 per 10 seconds).
restart_test_() ->
    {spawn,
     fun () ->
             process_flag(trap_exit, true),
             {ok, Sup} = start_link(started),

             exit(whereis(a), shutdown),
             case supervisor_exited(Sup, 100) of
                 true ->
                     exit(bad);
                 false ->
                     ok
             end,

             exit(whereis(b), shutdown),
             case supervisor_exited(Sup, 100) of
                 true ->
                     ok;
                 false ->
                     exit(bad2)
             end
     end}.

%% Neither `a' nor `b' is a registered name.
assert_children_down() ->
    undefined = whereis(a),
    undefined = whereis(b),
    ok.

%% Both `a' and `b' are registered processes.
assert_children_up() ->
    true = is_pid(whereis(a)),
    true = is_pid(whereis(b)),
    ok.

%% Wait up to Timeout ms for the 'EXIT' message of the linked supervisor.
supervisor_exited(Sup, Timeout) ->
    receive
        {'EXIT', Sup, _Reason} ->
            true
    after
        Timeout ->
            false
    end.

--------------------------------------------------------------------------------
/test/misc_tests.erl:
--------------------------------------------------------------------------------
%% @author Couchbase
%% @copyright 2020 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
-module(misc_tests).

-include("chronicle.hrl").

-include_lib("eunit/include/eunit.hrl").

%% Queue three {test, _} and two {test2, _} messages, interleaved, and check
%% the number ?FLUSH reports for each pattern.
%% NOTE(review): ?FLUSH comes from chronicle.hrl; presumably it drops the
%% matching messages from the mailbox and returns how many there were --
%% confirm against the macro definition.
flush_test() ->
    Queued = [{test, 1}, {test2, 1}, {test, 2}, {test, 3}, {test2, 2}],
    lists:foreach(fun (Msg) -> self() ! Msg end, Queued),

    3 = ?FLUSH({test, _}),
    2 = ?FLUSH({test2, _}).
--------------------------------------------------------------------------------