├── rebar.lock ├── CODE_OF_CONDUCT.md ├── .gitignore ├── rebar.config ├── src ├── wa_raft.app.src ├── wa_raft_app.erl ├── wa_raft_label.erl ├── wa_raft.erl ├── wa_raft_app_sup.erl ├── wa_raft_distribution.erl ├── wa_raft_transport_sup.erl ├── wa_raft_transport_target_sup.erl ├── wa_raft_metrics.erl ├── wa_raft_env.erl ├── wa_raft_durable_state.erl ├── wa_raft_info.erl ├── wa_raft_transport_cleanup.erl ├── wa_raft_transport_worker.erl ├── wa_raft_sup.erl ├── wa_raft_log_ets.erl ├── wa_raft_storage_ets.erl ├── wa_raft_part_sup.erl ├── wa_raft_dist_transport.erl ├── wa_raft_snapshot_catchup.erl ├── wa_raft_acceptor.erl ├── wa_raft_log_catchup.erl └── wa_raft_queue.erl ├── examples └── kvstore │ └── src │ ├── kvstore_app.erl │ ├── kvstore.app.src │ ├── kvstore_sup.erl │ └── kvstore_client.erl ├── include ├── wa_raft_logger.hrl ├── wa_raft_rpc.hrl └── wa_raft.hrl ├── .github └── workflows │ └── build.yml ├── CONTRIBUTING.md ├── README.md └── LICENSE /rebar.lock: -------------------------------------------------------------------------------- 1 | []. 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Meta has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 
6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .rebar3 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | ebin 10 | log 11 | erl_crash.dump 12 | .rebar 13 | _build 14 | .idea 15 | rebar3.crashdump 16 | .edts 17 | *.coverdata 18 | *.log 19 | *.log.* 20 | doc 21 | # Emacs Backup files 22 | *~ 23 | # Emacs temporary files 24 | .#* 25 | *# 26 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {erl_opts, [ debug_info 2 | , warnings_as_errors 3 | , warn_export_vars 4 | , warn_unused_import 5 | ] 6 | }. 7 | 8 | {deps, []}. 9 | 10 | {dialyzer, [ {warnings, [unknown]} 11 | , {plt_apps, all_deps} 12 | ]}. 13 | 14 | {xref_checks, [ undefined_function_calls 15 | , undefined_functions 16 | , locals_not_used 17 | , deprecated_function_calls 18 | , deprecated_functions 19 | ]}. 20 | -------------------------------------------------------------------------------- /src/wa_raft.app.src: -------------------------------------------------------------------------------- 1 | %% % @format 2 | 3 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 4 | %%% 5 | %%% This source code is licensed under the Apache 2.0 license found in 6 | %%% the LICENSE file in the root directory of this source tree. 7 | 8 | {application, wa_raft, [ 9 | {description, "Erlang implementation of RAFT Consensus Protocol"}, 10 | {vsn, "1.0.0"}, 11 | {modules, []}, 12 | {registered, []}, 13 | %% NOTE: No more dependency is expected for this app 14 | {applications, [ 15 | kernel, 16 | stdlib 17 | ]}, 18 | {env, []}, 19 | {mod, {wa_raft_app, []}} 20 | ]}. 
21 | -------------------------------------------------------------------------------- /examples/kvstore/src/kvstore_app.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | 6 | -module(kvstore_app). 7 | -compile(warn_missing_spec_all). 8 | 9 | -behaviour(application). 10 | 11 | %% API 12 | -export([ 13 | start/2, 14 | stop/1 15 | ]). 16 | 17 | -spec start(application:start_type(), term()) -> {ok, pid()}. 18 | start(normal, _Args) -> 19 | {ok, _Pid} = kvstore_sup:start_link(). 20 | 21 | -spec stop(term()) -> ok. 22 | stop(_State) -> 23 | ok. 24 | -------------------------------------------------------------------------------- /src/wa_raft_app.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% Application implementation for wa_raft. 7 | 8 | -module(wa_raft_app). 9 | -compile(warn_missing_spec_all). 10 | -behaviour(application). 11 | 12 | %% Application callbacks 13 | -export([ 14 | start/2, 15 | stop/1 16 | ]). 17 | 18 | -spec start(StartType :: application:start_type(), StartArgs :: term()) -> {ok, pid()}. 19 | start(normal, _Args) -> 20 | {ok, _Pid} = wa_raft_app_sup:start_link(). 21 | 22 | -spec stop(State :: term()) -> ok. 23 | stop(_State) -> 24 | ok. 25 | -------------------------------------------------------------------------------- /src/wa_raft_label.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 
2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% Pluggable module for labeling log entries before adding them to the RAFT log. 7 | 8 | -module(wa_raft_label). 9 | -compile(warn_missing_spec_all). 10 | 11 | -type label() :: dynamic(). 12 | 13 | -export_type([label/0]). 14 | 15 | %%% ------------------------------------------------------------------------ 16 | %%% Behaviour callbacks 17 | %%% 18 | 19 | % Produce a label for a new log record based on the log payload and the label of the preceding log entry. 20 | -callback new_label(LastLabel :: label(), Command :: wa_raft_acceptor:command()) -> NewLabel :: label(). 21 | -------------------------------------------------------------------------------- /examples/kvstore/src/kvstore.app.src: -------------------------------------------------------------------------------- 1 | %% % @format 2 | 3 | %% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 4 | %% 5 | %% This source code is licensed under the Apache 2.0 license found in 6 | %% the LICENSE file in the root directory of this source tree. 7 | 8 | {application, kvstore, [ 9 | {description, "Distributed Key-Value Storage"}, 10 | {vsn, "1.0.0"}, 11 | {modules, []}, 12 | {registered, [kvstore_sup]}, 13 | {applications, [ 14 | kernel, 15 | stdlib, 16 | wa_raft 17 | ]}, 18 | {env, [ 19 | % Specify where you want your data to be stored here 20 | {raft_database, "/mnt/kvstore"}, 21 | % Specify your own implementations here 22 | {raft_log_module, wa_raft_log_ets}, 23 | {raft_storage_module, wa_raft_storage_ets}, 24 | {raft_distribution_module, wa_raft_distribution}, 25 | {raft_transport_module, wa_raft_transport} 26 | ]}, 27 | {mod, {kvstore_app, []}} 28 | ]}.
29 | -------------------------------------------------------------------------------- /src/wa_raft.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This file defines dialyzer types. 7 | 8 | -module(wa_raft). 9 | -compile(warn_missing_spec_all). 10 | 11 | -include_lib("wa_raft/include/wa_raft.hrl"). 12 | 13 | %% Public Types 14 | -export_type([ 15 | table/0, 16 | partition/0, 17 | args/0, 18 | identity/0 19 | ]). 20 | 21 | -type table() :: atom(). 22 | -type partition() :: pos_integer(). 23 | 24 | %% Specification for starting a RAFT partition. 25 | -type args() :: 26 | #{ 27 | % Table name 28 | table := table(), 29 | % Partition number 30 | partition := partition(), 31 | % Distribution module 32 | distribution_module => module(), 33 | % Log module 34 | log_module => module(), 35 | % Log label module 36 | label_module => module(), 37 | % Storage module 38 | storage_module => module(), 39 | % Transport module 40 | transport_module => module() 41 | }. 42 | 43 | -type identity() :: #raft_identity{}. 44 | -------------------------------------------------------------------------------- /examples/kvstore/src/kvstore_sup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | 6 | %%% 7 | %%% This supervisor starts 4 RAFT partitions under itself. 8 | %%% 9 | 10 | -module(kvstore_sup). 11 | -compile(warn_missing_spec_all). 12 | 13 | -behaviour(supervisor). 14 | 15 | -export([ 16 | start_link/0, 17 | init/1 18 | ]). 
19 | 20 | -spec start_link() -> supervisor:startlink_ret(). 21 | start_link() -> 22 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 23 | 24 | -spec init(term()) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 25 | init([]) -> 26 | Partitions = [1, 2, 3, 4], 27 | Args = [raft_args(P) || P <- Partitions], 28 | ChildSpecs = [ 29 | wa_raft_sup:child_spec(Args) 30 | ], 31 | {ok, {#{}, ChildSpecs}}. 32 | 33 | % Construct a RAFT "args" for a partition. 34 | -spec raft_args(Partition :: wa_raft:partition()) -> wa_raft:args(). 35 | raft_args(Partition) -> 36 | % RAFT clusters are primarily identified by their table and partition number 37 | #{table => kvstore, partition => Partition}. 38 | -------------------------------------------------------------------------------- /include/wa_raft_logger.hrl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | 7 | -include_lib("kernel/include/logger.hrl"). 8 | 9 | -define(RAFT_LOG_OPTS, #{domain => [whatsapp, wa_raft]}). 10 | 11 | -define(RAFT_LOG_ERROR(Message), ?LOG_ERROR(Message, ?RAFT_LOG_OPTS)). 12 | -define(RAFT_LOG_ERROR(Format, Args), ?LOG_ERROR(Format, Args, ?RAFT_LOG_OPTS)). 13 | 14 | -define(RAFT_LOG_WARNING(Message), ?LOG_WARNING(Message, ?RAFT_LOG_OPTS)). 15 | -define(RAFT_LOG_WARNING(Format, Args), ?LOG_WARNING(Format, Args, ?RAFT_LOG_OPTS)). 16 | 17 | -define(RAFT_LOG_NOTICE(Message), ?LOG_NOTICE(Message, ?RAFT_LOG_OPTS)). 18 | -define(RAFT_LOG_NOTICE(Format, Args), ?LOG_NOTICE(Format, Args, ?RAFT_LOG_OPTS)). 19 | 20 | -define(RAFT_LOG_INFO(Message), ?LOG_INFO(Message, ?RAFT_LOG_OPTS)). 21 | -define(RAFT_LOG_INFO(Format, Args), ?LOG_INFO(Format, Args, ?RAFT_LOG_OPTS)). 
22 | 23 | -define(RAFT_LOG_DEBUG(Message), ?LOG_DEBUG(Message, ?RAFT_LOG_OPTS)). 24 | -define(RAFT_LOG_DEBUG(Format, Args), ?LOG_DEBUG(Format, Args, ?RAFT_LOG_OPTS)). 25 | 26 | -define(RAFT_LOG(Level, Message), ?LOG(Level, Message, ?RAFT_LOG_OPTS)). 27 | -define(RAFT_LOG(Level, Format, Args), ?LOG(Level, Format, Args, ?RAFT_LOG_OPTS)). 28 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: 5 | - '*' 6 | pull_request: 7 | types: 8 | - opened 9 | - synchronize 10 | jobs: 11 | linux: 12 | strategy: 13 | matrix: 14 | platform: [ubuntu-latest] 15 | otp-version: [24] 16 | runs-on: ${{ matrix.platform }} 17 | container: 18 | image: erlang:${{ matrix.otp-version }} 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v2 22 | - name: Cache Hex packages 23 | uses: actions/cache@v1 24 | with: 25 | path: ~/.cache/rebar3/hex/hexpm/packages 26 | key: ${{ runner.os }}-hex-${{ hashFiles(format('{0}{1}', github.workspace, '/rebar.lock')) }} 27 | restore-keys: | 28 | ${{ runner.os }}-hex- 29 | - name: Cache Dialyzer PLTs 30 | uses: actions/cache@v1 31 | with: 32 | path: ~/.cache/rebar3/rebar3_*_plt 33 | key: ${{ runner.os }}-dialyzer-${{ hashFiles(format('{0}{1}', github.workspace, '/rebar.config')) }} 34 | restore-keys: | 35 | ${{ runner.os }}-dialyzer- 36 | - name: Compile 37 | run: rebar3 compile 38 | - name: Generate Dialyzer PLT 39 | run: dialyzer --build_plt --apps erts kernel stdlib 40 | - name: Run CT Tests 41 | run: rebar3 ct 42 | - name: Run Checks 43 | run: rebar3 do dialyzer, xref 44 | - name: Produce Documentation 45 | run: rebar3 edoc 46 | if: ${{ matrix.otp-version == '24' }} 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 
Contributing to WhatsApp Raft 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | We expect to ship changes to existing setup scripts and add new setup scripts on an ongoing basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. Make sure your changes lint and work with all past and present versions of WhatsApp RAFT. 13 | 3. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | For issues on your integration with WhatsApp Raft, please use our 26 | support channel at . 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## License 33 | By contributing to WhatsApp Raft, you agree that your contributions will be licensed 34 | under the LICENSE file in the root directory of this source tree. 35 | -------------------------------------------------------------------------------- /examples/kvstore/src/kvstore_client.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 
5 | 6 | %%% 7 | %%% This module offers APIs to access the storage. 8 | %%% 9 | -module(kvstore_client). 10 | -compile(warn_missing_spec_all). 11 | 12 | -export([ 13 | read/1, 14 | write/2, 15 | delete/1 16 | ]). 17 | 18 | -include_lib("wa_raft/include/wa_raft.hrl"). 19 | 20 | -define(CALL_TIMEOUT, 5000). 21 | -define(TABLE, kvstore). 22 | -define(NUM_PARTITIONS, 4). 23 | 24 | %% Read value for a given key. It's a blocking call. 25 | -spec read(term()) -> {ok, term()} | wa_raft_acceptor:read_error(). 26 | read(Key) -> 27 | Acceptor = ?RAFT_ACCEPTOR_NAME(?TABLE, partition(Key)), 28 | wa_raft_acceptor:read(Acceptor, {read, ?TABLE, Key}, ?CALL_TIMEOUT). 29 | 30 | %% Write a key/value pair to storage. It's a blocking call. 31 | -spec write(term(), term()) -> ok | wa_raft_acceptor:commit_error(). 32 | write(Key, Value) -> 33 | commit(Key, {write, ?TABLE, Key, Value}). 34 | 35 | %% Delete a key/value pair. It's a blocking call. 36 | -spec delete(term()) -> ok | wa_raft_acceptor:commit_error(). 37 | delete(Key) -> 38 | commit(Key, {delete, ?TABLE, Key}). 39 | 40 | -spec commit(term(), term()) -> term() | wa_raft_acceptor:commit_error(). 41 | commit(Key, Command) -> 42 | Acceptor = ?RAFT_ACCEPTOR_NAME(?TABLE, partition(Key)), 43 | wa_raft_acceptor:commit(Acceptor, {make_ref(), Command}, ?CALL_TIMEOUT). 44 | 45 | -spec partition(term()) -> number(). 46 | partition(Key) -> 47 | erlang:phash2(Key, ?NUM_PARTITIONS) + 1. 48 | -------------------------------------------------------------------------------- /src/wa_raft_app_sup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 
5 | %%% 6 | %%% Application supervisor to be started by the wa_raft application for 7 | %%% supervising services and resources shared between application-started 8 | %%% RAFT processes. 9 | 10 | -module(wa_raft_app_sup). 11 | -compile(warn_missing_spec_all). 12 | -behaviour(supervisor). 13 | 14 | %% API 15 | -export([ 16 | start_link/0 17 | ]). 18 | 19 | %% Supervisor callbacks 20 | -export([ 21 | init/1 22 | ]). 23 | 24 | -include_lib("wa_raft/include/wa_raft.hrl"). 25 | 26 | -spec start_link() -> supervisor:startlink_ret(). 27 | start_link() -> 28 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 29 | 30 | -spec init(Arg :: term()) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 31 | init(_) -> 32 | % Cache certain commonly used configuration values. 33 | case ?RAFT_METRICS_MODULE() of 34 | {ok, Module} -> wa_raft_metrics:install(Module); 35 | _Other -> ok 36 | end, 37 | 38 | % Setup tables used by shared services. 39 | wa_raft_info:init_tables(), 40 | wa_raft_transport:setup_tables(), 41 | wa_raft_log_catchup:init_tables(), 42 | 43 | % Configure startup of shared services. 44 | ChildSpecs = [ 45 | wa_raft_transport:child_spec(), 46 | wa_raft_transport_sup:child_spec(), 47 | wa_raft_dist_transport:child_spec(), 48 | wa_raft_snapshot_catchup:child_spec() 49 | ], 50 | 51 | {ok, {#{strategy => one_for_one, intensity => 5, period => 1}, lists:flatten(ChildSpecs)}}. 52 | -------------------------------------------------------------------------------- /src/wa_raft_distribution.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% Pluggable distribution interface. The default implementation uses Erlang 7 | %%% distribution. 8 | 9 | -module(wa_raft_distribution). 
10 | -compile(warn_missing_spec_all). 11 | 12 | -export([ 13 | cast/3, 14 | call/4, 15 | reply/3 16 | ]). 17 | 18 | -include_lib("wa_raft/include/wa_raft.hrl"). 19 | 20 | -type dest_addr() :: {Name :: atom(), Node :: node()}. 21 | 22 | -export_type([ 23 | dest_addr/0 24 | ]). 25 | 26 | %%% ------------------------------------------------------------------------ 27 | %%% Behaviour callbacks 28 | %%% 29 | 30 | -callback cast(dest_addr(), #raft_identifier{}, term()) -> term(). 31 | -callback call(dest_addr(), #raft_identifier{}, term(), integer() | infinity) -> term(). 32 | -callback reply(gen_server:from() | gen_statem:from(), #raft_identifier{}, term()) -> term(). 33 | 34 | %%% ------------------------------------------------------------------------ 35 | %%% Erlang distribution default implementation 36 | %%% 37 | 38 | -spec cast(DestAddr :: dest_addr(), Identifier :: #raft_identifier{}, Message :: term()) -> term(). 39 | cast(DestAddr, _Identifier, Message) -> 40 | erlang:send(DestAddr, {'$gen_cast', Message}, [noconnect, nosuspend]). 41 | 42 | -spec call(DestAddr :: dest_addr(), Identifier :: #raft_identifier{}, Message :: term(), Timeout :: integer() | infinity) -> term(). 43 | call(DestAddr, _Identifier, Message, Timeout) -> 44 | gen_server:call(DestAddr, Message, Timeout). 45 | 46 | -spec reply(From :: gen_server:from() | gen_statem:from(), Identifier :: #raft_identifier{}, Reply :: term()) -> term(). 47 | reply(From, _Identifier, Reply) -> 48 | gen:reply(From, Reply). 49 | -------------------------------------------------------------------------------- /src/wa_raft_transport_sup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 
5 | %%% 6 | %%% OTP supervisor for handling workers responsible for actual data 7 | %%% send and receive for RAFT transport mechanisms. 8 | 9 | -module(wa_raft_transport_sup). 10 | -compile(warn_missing_spec_all). 11 | -behaviour(supervisor). 12 | 13 | %% Internal API 14 | -export([ 15 | get_or_start/1 16 | ]). 17 | 18 | %% OTP supervision callbacks 19 | -export([ 20 | child_spec/0, 21 | start_link/0 22 | ]). 23 | 24 | %% supervisor callbacks 25 | -export([ 26 | init/1 27 | ]). 28 | 29 | %%% ------------------------------------------------------------------------ 30 | %%% Internal API 31 | %%% 32 | 33 | -spec get_or_start(node()) -> atom(). 34 | get_or_start(Node) -> 35 | Name = wa_raft_transport_target_sup:name(Node), 36 | not is_pid(whereis(Name)) andalso 37 | supervisor:start_child(?MODULE, wa_raft_transport_target_sup:child_spec(Node)), 38 | Name. 39 | 40 | %%% ------------------------------------------------------------------------ 41 | %%% OTP supervision callbacks 42 | %%% 43 | 44 | -spec child_spec() -> supervisor:child_spec(). 45 | child_spec() -> 46 | #{ 47 | id => ?MODULE, 48 | start => {?MODULE, start_link, []}, 49 | restart => permanent, 50 | shutdown => infinity, 51 | type => supervisor, 52 | modules => [?MODULE] 53 | }. 54 | 55 | -spec start_link() -> supervisor:startlink_ret(). 56 | start_link() -> 57 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 58 | 59 | %%% ------------------------------------------------------------------------ 60 | %%% supervisor callbacks 61 | %%% 62 | 63 | -spec init(term()) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 64 | init(_) -> 65 | {ok, {#{strategy => one_for_one, intensity => 5, period => 1}, []}}. 66 | -------------------------------------------------------------------------------- /src/wa_raft_transport_target_sup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% Supervisor responsible for managing workers responsible for the 7 | %%% transport to a particular target node. 8 | 9 | -module(wa_raft_transport_target_sup). 10 | -compile(warn_missing_spec_all). 11 | -behaviour(supervisor). 12 | 13 | %% Internal API 14 | -export([ 15 | name/1 16 | ]). 17 | 18 | %% OTP supervision callbacks 19 | -export([ 20 | child_spec/1, 21 | start_link/1 22 | ]). 23 | 24 | %% Supervisor callbacks 25 | -export([ 26 | init/1 27 | ]). 28 | 29 | -include_lib("wa_raft/include/wa_raft.hrl"). 30 | 31 | %%% ------------------------------------------------------------------------ 32 | %%% Internal API 33 | %%% 34 | 35 | -spec name(node()) -> atom(). 36 | name(Name) -> 37 | binary_to_atom(<<"raft_transport_target_sup_", (atom_to_binary(Name))/binary>>). 38 | 39 | %%% ------------------------------------------------------------------------ 40 | %%% OTP supervision callbacks 41 | %%% 42 | 43 | -spec child_spec(node()) -> supervisor:child_spec(). 44 | child_spec(Node) -> 45 | #{ 46 | id => Node, 47 | start => {?MODULE, start_link, [Node]}, 48 | restart => temporary, 49 | shutdown => infinity, 50 | type => supervisor, 51 | modules => [?MODULE] 52 | }. 53 | 54 | -spec start_link(node()) -> supervisor:startlink_ret(). 55 | start_link(Node) -> 56 | supervisor:start_link({local, name(Node)}, ?MODULE, Node). 57 | 58 | %%% ------------------------------------------------------------------------ 59 | %%% supervisor callbacks 60 | %%% 61 | 62 | -spec init(node()) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 63 | init(Node) -> 64 | NumThreads = ?RAFT_TRANSPORT_THREADS(), 65 | Specs = [wa_raft_transport_worker:child_spec(Node, N) || N <- lists:seq(1, NumThreads)], 66 | {ok, {#{strategy => one_for_all, intensity => 5, period => 1}, Specs}}. 
67 | -------------------------------------------------------------------------------- /src/wa_raft_metrics.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% Pluggable metrics interface to allow integration with different metrics systems. 7 | %%% The default implementation skips metrics logging and does nothing. 8 | 9 | -module(wa_raft_metrics). 10 | -compile(warn_missing_spec_all). 11 | 12 | %% Public API 13 | -export([ 14 | install/1 15 | ]). 16 | 17 | %% Default Implementation 18 | -export([ 19 | count/1, 20 | countv/2, 21 | gather/2, 22 | gather_latency/2 23 | ]). 24 | 25 | %% Public Types 26 | -export_type([ 27 | metric/0, 28 | value/0 29 | ]). 30 | 31 | -include_lib("wa_raft/include/wa_raft.hrl"). 32 | 33 | %%------------------------------------------------------------------- 34 | %% RAFT Metrics Behaviour 35 | %%------------------------------------------------------------------- 36 | 37 | %% Report a single occurrence of some metric. 38 | -callback count(metric()) -> ok. 39 | %% Report a number of occurrences of some metric. 40 | -callback countv(metric(), value()) -> ok. 41 | %% Report the measured value of an occurrence of some metric. 42 | -callback gather(metric(), value()) -> ok. 43 | %% Report the measured latency of an occurrence of some metric. 44 | -callback gather_latency(metric(), value()) -> ok. 45 | 46 | %%------------------------------------------------------------------- 47 | %% Public Types 48 | %%------------------------------------------------------------------- 49 | 50 | -type metric() :: atom() | tuple(). 51 | -type value() :: integer().
52 | 53 | %%------------------------------------------------------------------- 54 | %% Public API 55 | %%------------------------------------------------------------------- 56 | 57 | %% Replace the previously installed or default module used to report 58 | %% RAFT metrics with the provided module. 59 | -spec install(Module :: module()) -> ok. 60 | install(Module) -> 61 | persistent_term:put(?RAFT_METRICS_MODULE_KEY, Module). 62 | 63 | %%------------------------------------------------------------------- 64 | %% Default Implementation 65 | %%------------------------------------------------------------------- 66 | 67 | -spec count(metric()) -> ok. 68 | count(_Metric) -> 69 | ok. 70 | 71 | -spec countv(metric(), value()) -> ok. 72 | countv(_Metric, _Value) -> 73 | ok. 74 | 75 | -spec gather(metric(), value()) -> ok. 76 | gather(_Metric, _Value) -> 77 | ok. 78 | 79 | -spec gather_latency(metric(), value()) -> ok. 80 | gather_latency(_Metric, _Value) -> 81 | ok. 82 | -------------------------------------------------------------------------------- /src/wa_raft_env.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This module contains utility functions for consulting application 7 | %%% configuration in the OTP application environment according to the search 8 | %%% order configured for each RAFT partition. 9 | 10 | -module(wa_raft_env). 11 | -compile(warn_missing_spec_all). 12 | 13 | %% Config API 14 | -export([ 15 | database_path/1 16 | ]). 17 | 18 | %% Internal API 19 | -export([ 20 | get_env/2, 21 | get_env/3 22 | ]). 23 | 24 | -type scope() :: Application :: atom() | {Table :: wa_raft:table(), Partition :: wa_raft:partition()} | SearchApps :: [atom()]. 
25 | -type key() :: Key :: atom() | {Primary :: atom(), Fallback :: atom()}. 26 | 27 | -include_lib("wa_raft/include/wa_raft.hrl"). 28 | 29 | %%------------------------------------------------------------------- 30 | %% Config API 31 | %%------------------------------------------------------------------- 32 | 33 | -spec database_path(Scope :: scope()) -> Root :: file:filename(). 34 | database_path(Scope) -> 35 | case get_env(Scope, ?RAFT_DATABASE) of 36 | {ok, Root} -> Root; 37 | undefined -> error({no_configured_database_path, Scope}) 38 | end. 39 | 40 | %%------------------------------------------------------------------- 41 | %% Internal API 42 | %%------------------------------------------------------------------- 43 | 44 | -spec get_env(Scope :: scope(), Key :: key()) -> {ok, Value :: dynamic()} | undefined. 45 | get_env(Scope, Key) -> 46 | get_env_impl(search_apps(Scope), key(Key), fallback(Key)). 47 | 48 | -spec get_env(Scope :: scope(), Key :: key(), Default :: Value) -> Value. 49 | get_env(Scope, Key, Default) -> 50 | case get_env(Scope, Key) of 51 | {ok, Value} -> Value; 52 | undefined -> Default 53 | end. 54 | 55 | -spec get_env_impl(SearchApps :: [atom()], Key :: atom(), FallbackKey :: atom()) -> {ok, Value :: dynamic()} | undefined. 56 | get_env_impl([], _Key, FallbackKey) -> 57 | ?RAFT_CONFIG(FallbackKey); 58 | get_env_impl([App | SearchApps], Key, FallbackKey) -> 59 | case application:get_env(App, Key) of 60 | {ok, Value} -> {ok, Value}; 61 | undefined -> get_env_impl(SearchApps, Key, FallbackKey) 62 | end. 63 | 64 | -spec search_apps(Scope :: scope()) -> SearchApps :: [atom()]. 
65 | search_apps(Application) when is_atom(Application) -> 66 | case wa_raft_sup:options(Application) of 67 | undefined -> []; 68 | RaftApplication -> RaftApplication#raft_application.config_search_apps 69 | end; 70 | search_apps({Table, Partition}) -> 71 | case wa_raft_part_sup:options(Table, Partition) of 72 | undefined -> []; 73 | Options -> search_apps(Options#raft_options.application) 74 | end; 75 | search_apps(SearchApps) -> 76 | SearchApps. 77 | 78 | -spec key(key()) -> atom(). 79 | key({Key, _}) -> Key; 80 | key(Key) -> Key. 81 | 82 | -spec fallback(key()) -> atom(). 83 | fallback({_, Fallback}) -> Fallback; 84 | fallback(Key) -> Key. 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WhatsApp Raft - WARaft 2 | 3 | WARaft is a Raft library in Erlang by WhatsApp. It provides an Erlang implementation to obtain consensus among replicated state machines. Consensus is a fundamental problem in fault-tolerant distributed systems. WARaft has been used as the consensus provider in WhatsApp message storage, which is a large-scale, strongly consistent storage system across 5+ datacenters. 4 | 5 | ## Features 6 | 7 | * Full implementation of the Raft consensus algorithm defined in https://raft.github.io/ 8 | * Extensible framework. It offers pluggable component interfaces for the log, state machine and transport layers. Users can also provide their own implementations to customize these components. 9 | * Performant. It is highly optimized for large-volume transaction use cases. It can support up to 200K transactions/s within a 5-node cluster. 10 | * Distributed key value store. WARaft provides components needed to build a distributed key-value storage. 11 | 12 | ## Get Started 13 | 14 | The following code snippet gives a quick look at how WARaft works. It creates a single-node WARaft cluster and writes and reads a record.
15 | 16 | ```erlang 17 | % Setup the WARaft application and the host application 18 | rr(wa_raft_server). 19 | application:ensure_all_started(wa_raft). 20 | application:set_env(test_app, raft_database, "."). 21 | % Create a spec for partition 1 of the RAFT table "test" and start it. 22 | Spec = wa_raft_sup:child_spec(test_app, [#{table => test, partition => 1}]). 23 | % Here we add WARaft to the kernel's supervisor, but you should place WARaft's 24 | % child spec underneath your application's supervisor in a real deployment. 25 | supervisor:start_child(kernel_sup, Spec). 26 | % Check that the RAFT server started successfully 27 | wa_raft_server:status(raft_server_test_1). 28 | % Make a cluster configuration with the current node as the only member 29 | Config = wa_raft_server:make_config([#raft_identity{name = raft_server_test_1, node = node()}]). 30 | % Bootstrap the RAFT server to get it started 31 | wa_raft_server:bootstrap(raft_server_test_1, #raft_log_pos{index = 1, term = 1}, Config, #{}). 32 | % Wait for the RAFT server to become the leader 33 | wa_raft_server:status(raft_server_test_1). 34 | % Read and write against a key 35 | wa_raft_acceptor:commit(raft_acceptor_test_1, {make_ref(), {write, test, key, 1000}}). 36 | wa_raft_acceptor:read(raft_acceptor_test_1, {read, test, key}). 37 | ``` 38 | 39 | A typical output would look like the following: 40 | 41 | ```erlang 42 | 1> % Setup the WARaft application and the host application 43 | rr(wa_raft_server). 44 | [raft_application,raft_identifier,raft_identity,raft_log, 45 | raft_log_pos,raft_options,raft_state] 46 | 2> application:ensure_all_started(wa_raft). 47 | {ok,[wa_raft]} 48 | 3> application:set_env(test_app, raft_database, "."). 49 | ok 50 | 4> % Create a spec for partition 1 of the RAFT table "test" and start it. 51 | Spec = wa_raft_sup:child_spec(test_app, [#{table => test, partition => 1}]). 
52 | #{id => wa_raft_sup,restart => permanent,shutdown => infinity, 53 | start => 54 | {wa_raft_sup,start_link, 55 | [test_app,[#{table => test,partition => 1}],#{}]}, 56 | type => supervisor, 57 | modules => [wa_raft_sup]} 58 | 5> % Here we add WARaft to the kernel's supervisor, but you should place WARaft's 59 | % child spec underneath your application's supervisor in a real deployment. 60 | supervisor:start_child(kernel_sup, Spec). 61 | {ok,<0.101.0>} 62 | 6> % Check that the RAFT server started successfully 63 | wa_raft_server:status(raft_server_test_1). 64 | [{state,stalled}, 65 | {id,nonode@nohost}, 66 | {table,test}, 67 | {partition,1}, 68 | {data_dir,"./test.1"}, 69 | {current_term,0}, 70 | {voted_for,undefined}, 71 | {commit_index,0}, 72 | {last_applied,0}, 73 | {leader_id,undefined}, 74 | {next_index,#{}}, 75 | {match_index,#{}}, 76 | {log_module,wa_raft_log_ets}, 77 | {log_first,0}, 78 | {log_last,0}, 79 | {votes,#{}}, 80 | {inflight_applies,0}, 81 | {disable_reason,undefined}, 82 | {config,#{version => 1,membership => [],witness => []}}, 83 | {config_index,0}, 84 | {witness,false}] 85 | 7> % Make a cluster configuration with the current node as the only member 86 | Config = wa_raft_server:make_config([#raft_identity{name = raft_server_test_1, node = node()}]). 87 | #{version => 1, 88 | membership => [{raft_server_test_1,nonode@nohost}], 89 | witness => []} 90 | 8> % Bootstrap the RAFT server to get it started 91 | wa_raft_server:bootstrap(raft_server_test_1, #raft_log_pos{index = 1, term = 1}, Config, #{}). 92 | ok 93 | 9> % Wait for the RAFT server to become the leader 94 | wa_raft_server:status(raft_server_test_1). 
95 | [{state,leader}, 96 | {id,nonode@nohost}, 97 | {table,test}, 98 | {partition,1}, 99 | {data_dir,"./test.1"}, 100 | {current_term,1}, 101 | {voted_for,nonode@nohost}, 102 | {commit_index,2}, 103 | {last_applied,2}, 104 | {leader_id,nonode@nohost}, 105 | {next_index,#{}}, 106 | {match_index,#{}}, 107 | {log_module,wa_raft_log_ets}, 108 | {log_first,1}, 109 | {log_last,2}, 110 | {votes,#{}}, 111 | {inflight_applies,0}, 112 | {disable_reason,undefined}, 113 | {config,#{version => 1, 114 | membership => [{raft_server_test_1,nonode@nohost}], 115 | witness => []}}, 116 | {config_index,1}, 117 | {witness,false}] 118 | 10> % Read and write against a key 119 | wa_raft_acceptor:commit(raft_acceptor_test_1, {make_ref(), {write, test, key, 1000}}). 120 | ok 121 | 11> wa_raft_acceptor:read(raft_acceptor_test_1, {read, test, key}). 122 | {ok,1000} 123 | ``` 124 | 125 | The [example directory](https://github.com/WhatsApp/waraft/tree/main/examples/kvstore/src) contains an example generic key-value store built on top of WARaft. 126 | 127 | ## License 128 | 129 | WARaft is [Apache licensed](./LICENSE). 130 | -------------------------------------------------------------------------------- /src/wa_raft_durable_state.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This module implements functions for storing / loading persistent state. 7 | 8 | -module(wa_raft_durable_state). 9 | -compile(warn_missing_spec_all). 10 | 11 | -include_lib("wa_raft/include/wa_raft.hrl"). 12 | -include_lib("wa_raft/include/wa_raft_logger.hrl"). 13 | 14 | -export([ 15 | load/1, 16 | store/1, 17 | sync/1 18 | ]). 
19 | 20 | -spec load(StateIn :: #raft_state{}) -> {ok, StateOut :: #raft_state{}} | no_state | {error, Reason :: term()}. 21 | load(#raft_state{name = Name, partition_path = PartitionPath} = State) -> 22 | StateItems = [ 23 | {current_term, fun is_integer/1, fun (V, S) -> S#raft_state{current_term = V} end, required}, 24 | {voted_for, fun is_atom/1, fun (V, S) -> S#raft_state{voted_for = V} end, required}, 25 | {disable_reason, undefined, fun (V, S) -> S#raft_state{disable_reason = V} end, undefined} 26 | ], 27 | StateFile = filename:join(PartitionPath, ?STATE_FILE_NAME), 28 | case file:consult(StateFile) of 29 | {ok, [{crc, CRC} | StateTerms]} -> 30 | case erlang:crc32(term_to_binary(StateTerms, [{minor_version, 1}, deterministic])) of 31 | CRC -> 32 | try 33 | {ok, lists:foldl( 34 | fun ({Item, Validator, Updater, Default}, StateN) -> 35 | case proplists:lookup(Item, StateTerms) of 36 | none when Default =:= required -> 37 | ?RAFT_LOG_ERROR("~p read state file but cannot find ~p.", [Name, Item]), 38 | throw({error, {missing, Item}}); 39 | none -> 40 | Updater(Default, StateN); 41 | {Item, Value} -> 42 | case Validator =:= undefined orelse Validator(Value) of 43 | true -> 44 | Updater(Value, StateN); 45 | false -> 46 | ?RAFT_LOG_ERROR("~p read state file but ~p has an invalid value `~p`.", [Name, Item, Value]), 47 | throw({error, {invalid, Item}}) 48 | end 49 | end 50 | end, State, StateItems)} 51 | catch 52 | throw:{error, Reason} -> {error, Reason} 53 | end; 54 | InvalidCRC -> 55 | ?RAFT_LOG_ERROR("~p read state file but CRCs did not match. 
(saved crc: ~p, computed crc: ~p)", [Name, InvalidCRC, CRC]), 56 | {error, invalid_crc} 57 | end; 58 | {ok, _} -> 59 | ?RAFT_LOG_ERROR("~p read state file but no CRC was found", [Name]), 60 | {error, no_crc}; 61 | {error, enoent} -> 62 | ?RAFT_LOG_NOTICE("~p is not loading non-existant state file.", [Name]), 63 | no_state; 64 | {error, Reason} -> 65 | ?RAFT_LOG_ERROR("~p could not read state file due to ~p.", [Name, Reason]), 66 | {error, Reason} 67 | end. 68 | 69 | -spec store(#raft_state{}) -> ok | {error, Reason :: term()}. 70 | store(#raft_state{name = Name, partition_path = PartitionPath, current_term = CurrentTerm, voted_for = VotedFor, disable_reason = DisableReason}) -> 71 | StateList = [ 72 | {current_term, CurrentTerm}, 73 | {voted_for, VotedFor}, 74 | {disable_reason, DisableReason} 75 | ], 76 | StateListWithCRC = [{crc, erlang:crc32(term_to_binary(StateList, [{minor_version, 1}, deterministic]))} | StateList], 77 | StateIO = [io_lib:format("~p.~n", [Term]) || Term <- StateListWithCRC], 78 | StateFile = filename:join(PartitionPath, ?STATE_FILE_NAME), 79 | StateFileTemp = [StateFile, ".temp"], 80 | case filelib:ensure_dir(StateFile) of 81 | ok -> 82 | case prim_file:write_file(StateFileTemp, StateIO) of 83 | ok -> 84 | case file:rename(StateFileTemp, StateFile) of 85 | ok -> 86 | ok; 87 | {error, Reason} -> 88 | ?RAFT_COUNT({'raft.server.persist_state.error.rename', Reason}), 89 | ?RAFT_LOG_ERROR("~p failed to rename temporary state file due to ~p.", [Name, Reason]), 90 | {error, {rename, Reason}} 91 | end; 92 | {error, Reason} -> 93 | ?RAFT_COUNT({'raft.server.persist_state.error.write', Reason}), 94 | ?RAFT_LOG_ERROR("~p failed to write current state to temporary file due to ~p.", [Name, Reason]), 95 | {error, {write, Reason}} 96 | end; 97 | {error, Reason} -> 98 | ?RAFT_COUNT({'raft.server.persist_state.error.ensure_dir', Reason}), 99 | ?RAFT_LOG_ERROR("~p failed to ensure directory exists due to ~p.", [Name, Reason]), 100 | {error, {ensure_dir, 
Reason}} 101 | end. 102 | 103 | -spec sync(StateIn :: #raft_state{}) -> ok. 104 | sync(#raft_state{partition_path = PartitionPath}) -> 105 | StateFile = filename:join(PartitionPath, ?STATE_FILE_NAME), 106 | case prim_file:open(StateFile, [read, binary]) of 107 | {ok, Fd} -> 108 | prim_file:sync(Fd), 109 | prim_file:close(Fd), 110 | ok; 111 | _ -> 112 | ok 113 | end. 114 | -------------------------------------------------------------------------------- /include/wa_raft_rpc.hrl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This file contains macros defining the form of all RPCs and API 7 | %%% calls used as part of the RAFT protocol and RAFT server and storage API. 8 | 9 | %%------------------------------------------------------------------- 10 | %% RAFT Server RPC Formats 11 | %%------------------------------------------------------------------- 12 | %% As the RAFT process that is intended to performs the cross-node 13 | %% communication required to provide durability against failure, 14 | %% RAFT servers across nodes must agree on the RPC formats in use. 15 | %% This means that RPC formats should not be changed once created. 16 | %%------------------------------------------------------------------- 17 | 18 | -define(RAFT_NAMED_RPC(Type, Term, SenderName, SenderNode, Payload), {rpc, Type, Term, SenderName, SenderNode, Payload}). 19 | 20 | %% These two RPCs are used by RAFT catchup to receive the status of 21 | %% the RAFT server being sent to and should not change. 22 | -define(LEGACY_RAFT_RPC(Type, Term, SenderId, Payload), {rpc, Type, Term, SenderId, Payload}). 
23 | -define(LEGACY_APPEND_ENTRIES_RESPONSE_RPC(Term, SenderId, PrevLogIndex, Success, LastIndex), 24 | ?LEGACY_RAFT_RPC(append_entries_response, Term, SenderId, {PrevLogIndex, Success, LastIndex})). 25 | 26 | %%------------------------------------------------------------------- 27 | %% RAFT Server Procedures 28 | %%------------------------------------------------------------------- 29 | %% An RPC received from a peer is intended to trigger one of the 30 | %% procedures listed below. 31 | %%------------------------------------------------------------------- 32 | 33 | -define(APPEND_ENTRIES, append_entries). 34 | -define(APPEND_ENTRIES_RESPONSE, append_entries_response). 35 | -define(REQUEST_VOTE, request_vote). 36 | -define(VOTE, vote). 37 | -define(HANDOVER, handover). 38 | -define(HANDOVER_FAILED, handover_failed). 39 | -define(NOTIFY_TERM, notify_term). 40 | 41 | %% Definitions of each of the standard procedures. 42 | -define(PROCEDURE(Type, Payload), {procedure, Type, Payload}). 43 | -define(APPEND_ENTRIES(PrevLogIndex, PrevLogTerm, Entries, CommitIndex, TrimIndex), ?PROCEDURE(?APPEND_ENTRIES, {PrevLogIndex, PrevLogTerm, Entries, CommitIndex, TrimIndex})). 44 | -define(APPEND_ENTRIES_RESPONSE(PrevLogIndex, Success, MatchIndex, LastAppliedIndex), ?PROCEDURE(?APPEND_ENTRIES_RESPONSE, {PrevLogIndex, Success, MatchIndex, LastAppliedIndex})). 45 | -define(REQUEST_VOTE(ElectionType, LastLogIndex, LastLogTerm), ?PROCEDURE(?REQUEST_VOTE, {ElectionType, LastLogIndex, LastLogTerm})). 46 | -define(VOTE(Vote), ?PROCEDURE(?VOTE, {Vote})). 47 | -define(HANDOVER(Ref, PrevLogIndex, PrevLogTerm, Entries), ?PROCEDURE(?HANDOVER, {Ref, PrevLogIndex, PrevLogTerm, Entries})). 48 | -define(HANDOVER_FAILED(Ref), ?PROCEDURE(?HANDOVER_FAILED, {Ref})). 49 | -define(NOTIFY_TERM(), ?PROCEDURE(?NOTIFY_TERM, {})). 50 | 51 | %% A request to execute a particular procedure. This request could 52 | %% have been issued locally or as a result of a remote procedure 53 | %% call. 
The peer (if exists and could be oneself) that issued the 54 | %% procedure call will be provided as the sender. 55 | -define(REMOTE(Sender, Call), {remote, Sender, Call}). 56 | 57 | %%------------------------------------------------------------------- 58 | %% RAFT Server Internal Events 59 | %%------------------------------------------------------------------- 60 | %% An event produced internally within the RAFT server. 61 | %%------------------------------------------------------------------- 62 | 63 | -define(ADVANCE_TERM(Term), {advance_term, Term}). 64 | -define(FORCE_ELECTION(Term), {force_election, Term}). 65 | 66 | %%------------------------------------------------------------------- 67 | %% RAFT Server API 68 | %%------------------------------------------------------------------- 69 | %% The RAFT server also accepts commands issued from other processes 70 | %% on the local node. These commands are not guaranteed to have the 71 | %% same format between versions and so should only be used locally. 72 | %% Prefer to use `wa_raft_server` module exports when possible. 73 | %%------------------------------------------------------------------- 74 | 75 | -define(RAFT_COMMAND(Type, Payload), {command, Type, Payload}). 76 | 77 | -define(COMMIT_COMMAND(From, Op, Priority), ?RAFT_COMMAND(commit, {From, Op, Priority})). 78 | -define(READ_COMMAND(Op), ?RAFT_COMMAND(read, Op)). 79 | 80 | -define(CURRENT_CONFIG_COMMAND, ?RAFT_COMMAND(current_config, undefined)). 81 | -define(STATUS_COMMAND, ?RAFT_COMMAND(status, undefined)). 82 | -define(TRIGGER_ELECTION_COMMAND(TermOrOffset), ?RAFT_COMMAND(trigger_election, {TermOrOffset})). 83 | -define(PROMOTE_COMMAND(TermOrOffset, Force), ?RAFT_COMMAND(promote, {TermOrOffset, Force})). 84 | -define(RESIGN_COMMAND, ?RAFT_COMMAND(resign, undefined)). 85 | 86 | -define(ADJUST_CONFIG_COMMAND(Action, Index), ?RAFT_COMMAND(adjust_config, {Action, Index})). 87 | -define(REFRESH_CONFIG_COMMAND(), ?RAFT_COMMAND(refresh_config, undefined)). 
88 | 89 | -define(SNAPSHOT_AVAILABLE_COMMAND(Root, Position), ?RAFT_COMMAND(snapshot_available, {Root, Position})). 90 | 91 | -define(HANDOVER_CANDIDATES_COMMAND, ?RAFT_COMMAND(handover_candidates, undefined)). 92 | -define(HANDOVER_COMMAND(Peer), ?RAFT_COMMAND(handover, Peer)). 93 | 94 | -define(ENABLE_COMMAND, ?RAFT_COMMAND(enable, undefined)). 95 | -define(DISABLE_COMMAND(Reason), ?RAFT_COMMAND(disable, Reason)). 96 | 97 | -define(BOOTSTRAP_COMMAND(Position, Config, Data), ?RAFT_COMMAND(bootstrap, {Position, Config, Data})). 98 | 99 | -define(NOTIFY_COMPLETE_COMMAND(), ?RAFT_COMMAND(notify_complete, undefined)). 100 | -------------------------------------------------------------------------------- /src/wa_raft_info.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% API for accessing certain useful information about the state of local 7 | %%% RAFT servers without requiring a status request against the RAFT server 8 | %%% itself. 9 | 10 | -module(wa_raft_info). 11 | -compile(warn_missing_spec_all). 12 | 13 | %% Public API 14 | -export([ 15 | get_current_term/2, 16 | get_leader/2, 17 | get_current_term_and_leader/2, 18 | get_membership/2, 19 | get_live/2, 20 | get_stale/2, 21 | get_state/2, 22 | get_message_queue_length/1 23 | ]). 24 | 25 | %% Internal API 26 | -export([ 27 | init_tables/0, 28 | delete_state/2, 29 | set_current_term_and_leader/4, 30 | set_membership/3, 31 | set_live/3, 32 | set_stale/3, 33 | set_state/3, 34 | set_message_queue_length/1, 35 | set_message_queue_length/2 36 | ]). 37 | 38 | %% Local RAFT server's current FSM state 39 | -define(RAFT_SERVER_STATE_KEY(Table, Partition), {state, Table, Partition}). 
%% Local RAFT server's most recently known term and leader
-define(RAFT_CURRENT_TERM_AND_LEADER_KEY(Table, Partition), {term, Table, Partition}).
%% Local RAFT server's current live flag - indicates if the server thinks it is part of a live cluster
-define(RAFT_LIVE_KEY(Table, Partition), {live, Table, Partition}).
%% Local RAFT server's current stale flag - indicates if the server thinks its data is stale
-define(RAFT_STALE_KEY(Table, Partition), {stale, Table, Partition}).
%% Local RAFT server's message queue length
-define(RAFT_MSG_QUEUE_LENGTH_KEY(Name), {msg_queue_length, Name}).
%% Local RAFT server's most recently known membership
-define(RAFT_MEMBERSHIP_KEY(Table, Partition), {membership, Table, Partition}).

%%-------------------------------------------------------------------
%% RAFT Info - Public API
%%-------------------------------------------------------------------

%% Read the value stored under `Key` in the info table. `Default` is returned
%% when the key is absent and also when the lookup raises `badarg`.
-spec get(term(), Default) -> Default.
get(Key, Default) ->
    try ets:lookup_element(?MODULE, Key, 2, Default) of
        Value -> Value
    catch
        error:badarg -> Default
    end.

%% Leader half of the stored {Term, Leader} pair, or `undefined` if unknown.
-spec get_leader(wa_raft:table(), wa_raft:partition()) -> node() | undefined.
get_leader(Table, Partition) ->
    Key = ?RAFT_CURRENT_TERM_AND_LEADER_KEY(Table, Partition),
    {_Term, Leader} = get(Key, {undefined, undefined}),
    Leader.

%% Term half of the stored {Term, Leader} pair, or `undefined` if unknown.
-spec get_current_term(wa_raft:table(), wa_raft:partition()) -> wa_raft_log:log_term() | undefined.
get_current_term(Table, Partition) ->
    Key = ?RAFT_CURRENT_TERM_AND_LEADER_KEY(Table, Partition),
    {Term, _Leader} = get(Key, {undefined, undefined}),
    Term.

%% The RAFT server always sets both the known term and leader together, so that
%% the atomic read performed by this method will not return a known leader for
%% a different term.
-spec get_current_term_and_leader(wa_raft:table(), wa_raft:partition()) ->
    {wa_raft_log:log_term() | undefined, node() | undefined}.
get_current_term_and_leader(Table, Partition) ->
    Key = ?RAFT_CURRENT_TERM_AND_LEADER_KEY(Table, Partition),
    get(Key, {undefined, undefined}).

%% Last published FSM state of the local server; `undefined` when none is stored.
-spec get_state(wa_raft:table(), wa_raft:partition()) -> wa_raft_server:state() | undefined.
get_state(Table, Partition) ->
    Key = ?RAFT_SERVER_STATE_KEY(Table, Partition),
    get(Key, undefined).

%% Live flag of the local server; defaults to `false` when nothing is stored.
-spec get_live(wa_raft:table(), wa_raft:partition()) -> boolean().
get_live(Table, Partition) ->
    Key = ?RAFT_LIVE_KEY(Table, Partition),
    get(Key, false).

%% Stale flag of the local server; defaults to `true` when nothing is stored.
-spec get_stale(wa_raft:table(), wa_raft:partition()) -> boolean().
get_stale(Table, Partition) ->
    Key = ?RAFT_STALE_KEY(Table, Partition),
    get(Key, true).

%% Last published message queue length for `Name`; `undefined` when none is stored.
-spec get_message_queue_length(atom()) -> undefined | non_neg_integer().
get_message_queue_length(Name) ->
    Key = ?RAFT_MSG_QUEUE_LENGTH_KEY(Name),
    get(Key, undefined).

%% Last published membership of the local server; `undefined` when none is stored.
-spec get_membership(wa_raft:table(), wa_raft:partition()) -> wa_raft_server:membership() | undefined.
get_membership(Table, Partition) ->
    Key = ?RAFT_MEMBERSHIP_KEY(Table, Partition),
    get(Key, undefined).

%%-------------------------------------------------------------------
%% RAFT Info - Internal API
%%-------------------------------------------------------------------

%% Create the public named ETS table, named after this module, that backs
%% every getter and setter here.
-spec init_tables() -> ok.
init_tables() ->
    TableOptions = [set, public, named_table, {write_concurrency, true}, {read_concurrency, true}],
    ets:new(?MODULE, TableOptions),
    ok.

%% Store `Value` under `Key`: update the existing row in place when there is
%% one, otherwise insert a new row.
-spec set(term(), term()) -> true.
set(Key, Value) ->
    case ets:update_element(?MODULE, Key, {2, Value}) of
        true -> true;
        false -> ets:insert(?MODULE, {Key, Value})
    end.

%% Remove the row stored under `Key`, if any.
-spec delete(term()) -> true.
delete(Key) ->
    ets:delete(?MODULE, Key).

%% Publish the term and leader as a single {Term, Leader} value.
-spec set_current_term_and_leader(wa_raft:table(), wa_raft:partition(), wa_raft_log:log_term(), node()) -> true.
set_current_term_and_leader(Table, Partition, Term, Leader) ->
    Key = ?RAFT_CURRENT_TERM_AND_LEADER_KEY(Table, Partition),
    set(Key, {Term, Leader}).
122 | 123 | -spec set_state(wa_raft:table(), wa_raft:partition(), wa_raft_server:state()) -> true. 124 | set_state(Table, Partition, State) -> 125 | set(?RAFT_SERVER_STATE_KEY(Table, Partition), State). 126 | 127 | -spec delete_state(wa_raft:table(), wa_raft:partition()) -> true. 128 | delete_state(Table, Partition) -> 129 | delete(?RAFT_SERVER_STATE_KEY(Table, Partition)). 130 | 131 | -spec set_live(wa_raft:table(), wa_raft:partition(), boolean()) -> true. 132 | set_live(Table, Partition, Live) -> 133 | set(?RAFT_LIVE_KEY(Table, Partition), Live). 134 | 135 | -spec set_stale(wa_raft:table(), wa_raft:partition(), boolean()) -> true. 136 | set_stale(Table, Partition, Stale) -> 137 | set(?RAFT_STALE_KEY(Table, Partition), Stale). 138 | 139 | -spec set_membership(wa_raft:table(), wa_raft:partition(), wa_raft_server:membership()) -> true. 140 | set_membership(Table, Partition, Membership) -> 141 | set(?RAFT_MEMBERSHIP_KEY(Table, Partition), Membership). 142 | 143 | -spec set_message_queue_length(Name :: atom()) -> true. 144 | set_message_queue_length(Name) -> 145 | {message_queue_len, Length} = process_info(self(), message_queue_len), 146 | set_message_queue_length(Name, Length). 147 | 148 | -spec set_message_queue_length(Name :: atom(), Length :: non_neg_integer()) -> true. 149 | set_message_queue_length(Name, Length) -> 150 | set(?RAFT_MSG_QUEUE_LENGTH_KEY(Name), Length). 151 | -------------------------------------------------------------------------------- /src/wa_raft_transport_cleanup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | 6 | -module(wa_raft_transport_cleanup). 7 | -compile(warn_missing_spec_all). 8 | -behaviour(gen_server). 9 | 10 | -include_lib("wa_raft/include/wa_raft.hrl"). 
-include_lib("wa_raft/include/wa_raft_logger.hrl").

%% OTP supervision
-export([
    child_spec/1,
    start_link/1
]).

%% Internal API
-export([
    default_name/2,
    registered_name/2
]).

%% Server Callbacks
-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2
]).

%% Delay, in seconds, between two consecutive scans of the transport directory.
-define(RAFT_TRANSPORT_CLEANUP_SCAN_INTERVAL_SECS, 30).

-record(state, {
    application :: atom(),
    name :: atom(),
    directory :: file:filename()
}).

%%-------------------------------------------------------------------
%% OTP Supervision
%%-------------------------------------------------------------------

%% Child spec for running this cleanup server as a permanent worker.
-spec child_spec(Options :: #raft_options{}) -> supervisor:child_spec().
child_spec(Options) ->
    #{
        id => ?MODULE,
        modules => [?MODULE],
        restart => permanent,
        shutdown => 5000,
        start => {?MODULE, start_link, [Options]}
    }.

%% Start the cleanup server, locally registered under the name carried in
%% the `transport_cleanup_name` field of the options.
-spec start_link(Options :: #raft_options{}) -> gen_server:start_ret().
start_link(#raft_options{transport_cleanup_name = Name} = Options) ->
    ServerName = {local, Name},
    gen_server:start_link(ServerName, ?MODULE, Options, []).

%%-------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------

%% Get the default name for the RAFT transport cleanup server associated with
%% the provided RAFT partition.
-spec default_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_name(Table, Partition) ->
    Suffix = atom_to_list(Table) ++ "_" ++ integer_to_list(Partition),
    list_to_atom("raft_transport_cleanup_" ++ Suffix).

%% Get the registered name for the RAFT transport cleanup server associated
%% with the provided RAFT partition or the default name if no registration
%% exists.
-spec registered_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
registered_name(Table, Partition) ->
    PartitionOptions = wa_raft_part_sup:options(Table, Partition),
    case PartitionOptions of
        undefined -> default_name(Table, Partition);
        _ -> PartitionOptions#raft_options.transport_cleanup_name
    end.

%%-------------------------------------------------------------------
%% Server Callbacks
%%-------------------------------------------------------------------

%% Trap exits, arm the first scan timer, and keep the application, registered
%% name, and transport directory from the options as the server state.
-spec init(Options :: #raft_options{}) -> {ok, State :: #state{}}.
init(#raft_options{application = Application, transport_directory = Directory, transport_cleanup_name = Name}) ->
    process_flag(trap_exit, true),
    schedule_scan(),
    InitialState = #state{name = Name, application = Application, directory = Directory},
    {ok, InitialState}.

%% No calls are supported: the request is logged and no reply is sent.
-spec handle_call(Request :: term(), From :: gen_server:from(), State :: #state{}) -> {noreply, NewState :: #state{}}.
handle_call(Request, From, #state{name = Name} = State) ->
    ?RAFT_LOG_WARNING("~p received unrecognized call ~0P from ~0p", [Name, Request, 25, From]),
    {noreply, State}.

%% No casts are supported: the request is logged and dropped.
-spec handle_cast(Request :: term(), State :: #state{}) -> {noreply, NewState :: #state{}}.
handle_cast(Request, #state{name = Name} = State) ->
    ?RAFT_LOG_NOTICE("~p got unrecognized cast ~0P", [Name, Request, 25]),
    {noreply, State}.

%% `scan` runs one cleanup pass and re-arms the timer for the next one;
%% anything else is logged and dropped.
-spec handle_info(Info :: term(), State :: #state{}) -> {noreply, NewState :: #state{}}.
handle_info(scan, #state{} = State) ->
    maybe_cleanup(State),
    schedule_scan(),
    {noreply, State};
handle_info(Unexpected, #state{name = Name} = State) ->
    ?RAFT_LOG_NOTICE("~p got unrecognized info ~p", [Name, Unexpected]),
    {noreply, State}.

-spec maybe_cleanup(State :: #state{}) -> ok | {error, term()}.
%% Scan the transport directory and remove every transport directory that is
%% either no longer known to `wa_raft_transport` or whose transport ended more
%% than the configured retain interval ago. A missing directory is not an
%% error; any other listing failure is logged and returned.
maybe_cleanup(#state{application = App, name = Name, directory = Directory} = State) ->
    case prim_file:list_dir(Directory) of
        {ok, Files} ->
            RetainMillis = ?RAFT_TRANSPORT_RETAIN_INTERVAL(App) * 1000,
            NowMillis = erlang:system_time(millisecond),
            lists:foreach(
                fun (Filename) ->
                    maybe_cleanup_entry(Filename, NowMillis, RetainMillis, State)
                end, Files);
        {error, enoent} ->
            ok;
        {error, Reason} ->
            ?RAFT_LOG_WARNING(
                "~p failed to list transports for cleanup due to ~p",
                [Name, Reason]
            ),
            {error, Reason}
    end.

%% Decide what to do with a single entry of the transport directory.
%% Entries whose name is not an integer transport ID are skipped (and logged)
%% instead of crashing the server, so that a stray file in the directory
%% cannot stop cleanup of the remaining entries.
-spec maybe_cleanup_entry(
    Filename :: file:filename_all(),
    NowMillis :: integer(),
    RetainMillis :: number(),
    State :: #state{}
) -> ok | {error, term()}.
maybe_cleanup_entry(Filename, NowMillis, RetainMillis, #state{name = Name, directory = Directory} = State) ->
    case string:to_integer(Filename) of
        {ID, []} ->
            Path = filename:join(Directory, Filename),
            case wa_raft_transport:transport_info(ID) of
                {ok, #{end_ts := EndTs}} when NowMillis - EndTs > RetainMillis ->
                    ?RAFT_LOG_NOTICE(
                        "~p deleting ~p due to expiring after transport ended",
                        [Name, Filename]
                    ),
                    cleanup(ID, Path, State);
                {ok, _Info} ->
                    ok;
                not_found ->
                    ?RAFT_LOG_NOTICE(
                        "~p deleting ~p due to having no associated transport",
                        [Name, Filename]
                    ),
                    cleanup(ID, Path, State)
            end;
        _NotATransportId ->
            ?RAFT_LOG_WARNING(
                "~p skipping ~p during cleanup as it is not named after a transport ID",
                [Name, Filename]
            ),
            ok
    end.

%% Recursively delete the directory of the given transport. Failures are
%% logged and returned.
-spec cleanup(non_neg_integer(), string(), #state{}) -> ok | {error, term()}.
cleanup(ID, Path, #state{name = Name}) ->
    case file:del_dir_r(Path) of
        ok ->
            ok;
        {error, Reason} ->
            ?RAFT_LOG_WARNING(
                "~p failed to cleanup transport ~p due to ~p",
                [Name, ID, Reason]
            ),
            {error, Reason}
    end.

%% Arm the timer that delivers the next `scan` message to this process.
-spec schedule_scan() -> reference().
schedule_scan() ->
    erlang:send_after(?RAFT_TRANSPORT_CLEANUP_SCAN_INTERVAL_SECS * 1000, self(), scan).

--------------------------------------------------------------------------------
/src/wa_raft_transport_worker.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.

-module(wa_raft_transport_worker).
-compile(warn_missing_spec_all).
-behaviour(gen_server).

-include_lib("wa_raft/include/wa_raft.hrl").
-include_lib("wa_raft/include/wa_raft_logger.hrl").

%% OTP supervision
-export([
    child_spec/2,
    start_link/2
]).

%% gen_server callbacks
-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2
]).

%% Timeout returned from the callbacks so that the server immediately
%% receives a `timeout` message and goes on to process the next queued job.
-define(CONTINUE_TIMEOUT, 0).

-record(state, {
    %% Node passed to `Module:transport_init/1` when a module state is first needed
    node :: node(),
    %% Number of this worker; used as the prefix of its log messages
    number :: non_neg_integer(),
    %% Pending jobs, processed one per `timeout` message
    jobs = queue:new() :: queue:queue(job()),
    %% Per transport module state, as returned by that module's callbacks
    states = #{} :: #{module() => state()}
}).
-type state() :: #state{}.

%% Job asking the worker to pick up the next file of a transport
-record(transport, {
    id :: wa_raft_transport:transport_id()
}).
%% Job asking the worker to send (or continue sending) one file of a transport
-record(file, {
    id :: wa_raft_transport:transport_id(),
    file :: wa_raft_transport:file_id()
}).
-type job() :: #transport{} | #file{}.

%%% ------------------------------------------------------------------------
%%%  OTP supervision callbacks
%%%

%% Child spec for one permanent worker, identified by module, node and number.
-spec child_spec(Node :: node(), Number :: non_neg_integer()) -> supervisor:child_spec().
child_spec(Node, Number) ->
    #{
        id => {?MODULE, Node, Number},
        start => {?MODULE, start_link, [Node, Number]},
        restart => permanent,
        shutdown => 5000,
        modules => [?MODULE]
    }.

%% Start an unregistered worker for the given node and worker number.
-spec start_link(Node :: node(), Number :: non_neg_integer()) -> gen_server:start_ret().
start_link(Node, Number) ->
    gen_server:start_link(?MODULE, {Node, Number}, []).

%%% ------------------------------------------------------------------------
%%%  gen_server callbacks
%%%

-spec init(Args :: {node(), non_neg_integer()}) -> {ok, State :: state(), Timeout :: timeout()}.
init({Node, Number}) ->
    InitialState = #state{number = Number, node = Node},
    {ok, InitialState, ?CONTINUE_TIMEOUT}.

%% No calls are supported: the request is logged and no reply is sent.
-spec handle_call(Request :: term(), From :: {Pid :: pid(), Tag :: term()}, State :: state()) ->
    {noreply, NewState :: state(), Timeout :: timeout()}.
handle_call(Request, From, #state{number = Number} = State) ->
    ?RAFT_LOG_WARNING("[~p] received unrecognized call ~p from ~p", [Number, Request, From]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

%% `{notify, ID}` appends a transport job for `ID` to the job queue; any
%% other cast is logged and dropped. Both paths return the continue timeout
%% so that job processing resumes right away.
-spec handle_cast(Request, State :: state()) -> {noreply, NewState :: state(), Timeout :: timeout()}
    when Request :: {notify, wa_raft_transport:transport_id()}.
handle_cast({notify, ID}, #state{jobs = Jobs} = State) ->
    Queued = queue:in(#transport{id = ID}, Jobs),
    {noreply, State#state{jobs = Queued}, ?CONTINUE_TIMEOUT};
handle_cast(Unexpected, #state{number = Number} = State) ->
    ?RAFT_LOG_WARNING("[~p] received unrecognized cast ~p", [Number, Unexpected]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

-spec handle_info(Info :: term(), State :: state()) ->
    {noreply, NewState :: state()}
    | {noreply, NewState :: state(), Timeout :: timeout() | hibernate}.
%% Process one job per `timeout` message:
%%  - with an empty queue the worker hibernates;
%%  - a `#transport{}` job pops the next file of that transport, marks it as
%%    `sending`, and queues a `#file{}` job for it (the transport job is
%%    dropped when no file could be popped);
%%  - a `#file{}` job calls `Module:transport_send/3`; a `continue` result
%%    re-queues the same job, any other result completes the file and queues
%%    a `#transport{}` job to pick up the next file.
%% Any other message is logged and dropped.
handle_info(timeout, #state{number = Number, jobs = Jobs, states = States} = State) ->
    case queue:out(Jobs) of
        {empty, NewJobs} ->
            {noreply, State#state{jobs = NewJobs}, hibernate};
        {{value, #transport{id = ID}}, NewJobs} ->
            case wa_raft_transport:pop_file(ID) of
                {ok, FileID} ->
                    ?RAFT_COUNT('raft.transport.file.send'),
                    wa_raft_transport:update_file_info(ID, FileID,
                        fun (Info) -> Info#{status => sending, start_ts => erlang:system_time(millisecond)} end),
                    NewJob = #file{id = ID, file = FileID},
                    {noreply, State#state{jobs = queue:in(NewJob, NewJobs)}, ?CONTINUE_TIMEOUT};
                _Other ->
                    {noreply, State#state{jobs = NewJobs}, ?CONTINUE_TIMEOUT}
            end;
        {{value, #file{id = ID, file = FileID} = Job}, NewJobs} ->
            {Result, NewState} = case wa_raft_transport:transport_info(ID) of
                {ok, #{module := Module}} ->
                    try get_module_state(Module, State) of
                        {ok, ModuleState0} ->
                            try Module:transport_send(ID, FileID, ModuleState0) of
                                {ok, ModuleState1} ->
                                    {ok, State#state{states = States#{Module => ModuleState1}}};
                                {continue, ModuleState1} ->
                                    {continue, State#state{states = States#{Module => ModuleState1}}};
                                {stop, Reason, ModuleState1} ->
                                    {{stop, Reason}, State#state{states = States#{Module => ModuleState1}}}
                            catch
                                T:E:S ->
                                    ?RAFT_LOG_WARNING(
                                        "[~p] module ~p failed to send file ~p:~p due to ~p ~p: ~p",
                                        [Number, Module, ID, FileID, T, E, S]
                                    ),
                                    {{T, E}, State}
                            end;
                        Other ->
                            %% The module state could not be obtained; the returned
                            %% value becomes the result of the file.
                            {Other, State}
                    catch
                        T:E:S ->
                            ?RAFT_LOG_WARNING(
                                "[~p] module ~p failed to get/init module state due to ~p ~p: ~p",
                                [Number, Module, T, E, S]
                            ),
                            {{T, E}, State}
                    end;
                _ ->
                    ?RAFT_LOG_WARNING("[~p] trying to send for unknown transfer ~p", [Number, ID]),
                    {{stop, invalid_transport}, State}
            end,
            case Result =:= continue of
                true ->
                    {noreply, NewState#state{jobs = queue:in(Job, NewJobs)}, ?CONTINUE_TIMEOUT};
                false ->
                    wa_raft_transport:complete(ID, FileID, Result),
                    {noreply, NewState#state{jobs = queue:in(#transport{id = ID}, NewJobs)}, ?CONTINUE_TIMEOUT}
            end
    end;
handle_info(Info, #state{number = Number} = State) ->
    ?RAFT_LOG_WARNING("[~p] received unrecognized info ~p", [Number, Info]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

%% Give every transport module that has a stored state and exports
%% `transport_terminate/2` the chance to release it.
-spec terminate(term(), state()) -> ok.
terminate(Reason, #state{states = States}) ->
    [
        case erlang:function_exported(Module, transport_terminate, 2) of
            true -> Module:transport_terminate(Reason, State);
            false -> ok
        end
        || Module := State <- States
    ],
    ok.

%% Return the stored state of the given transport module, or the result of
%% `Module:transport_init/1` for this worker's node when none is stored yet.
%% The module state is opaque to the worker; it is not the worker's own state.
-spec get_module_state(module(), state()) -> {ok, ModuleState :: term()} | {stop, Reason :: term()}.
get_module_state(Module, #state{node = Node, states = States}) ->
    case States of
        #{Module := ModuleState} -> {ok, ModuleState};
        _ -> Module:transport_init(Node)
    end.

--------------------------------------------------------------------------------
/src/wa_raft_sup.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% Supervisor for supervising RAFT partitions started by a client application.
%%% As a `simple_one_for_one` supervisor, this supervisor can dynamically
%%% start and stop partitions and will stop partitions in parallel during
%%% shutdown.

-module(wa_raft_sup).
-compile(warn_missing_spec_all).
-behaviour(supervisor).

%% OTP supervision
-export([
    child_spec/1,
    child_spec/2,
    child_spec/3,
    start_link/3
]).

%% API
-export([
    start_partition/2,
    start_partition_under_application/2,
    stop_partition/2,
    stop_partition/3,
    stop_partition_under_application/2,
    stop_partition_under_application/3
]).

%% Internal API
-export([
    default_name/1,
    default_config_apps/1,
    registered_config_apps/1
]).

%% Internal API
-export([
    options/1
]).

%% Test API
-export([
    prepare_application/1,
    prepare_application/2
]).

%% Supervisor callbacks
-export([
    init/1
]).

-include_lib("wa_raft/include/wa_raft.hrl").

%% Key in persistent_term for the application options associated with an
%% application that has started a RAFT supervisor.
-define(OPTIONS_KEY(Application), {?MODULE, Application}).

%% Options for RAFT client applications
-type options() :: #{
    % RAFT will search for environment variables from applications in this order
    config_search_apps => [atom()]
}.

%%-------------------------------------------------------------------
%% OTP supervision
%%-------------------------------------------------------------------

%% Child spec for the RAFT supervisor of the application that the calling
%% process belongs to, using default options. Fails with a `badmatch` when
%% the caller does not belong to any application.
-spec child_spec(Specs :: [wa_raft:args()]) -> supervisor:child_spec().
child_spec(Specs) ->
    {ok, Application} = application:get_application(),
    child_spec(Application, Specs, #{}).

%% Two call shapes share this arity and are told apart by the second argument:
%%  * `(Application, Specs)`: explicit application, default options;
%%  * `(Specs, Options)`: application of the calling process, explicit options
%%    (fails with a `badmatch` when the caller belongs to no application).
%% The spec is overloaded so that both shapes are part of the contract.
-spec child_spec(Application :: atom(), Specs :: [wa_raft:args()]) -> supervisor:child_spec();
                (Specs :: [wa_raft:args()], Options :: options()) -> supervisor:child_spec().
child_spec(Application, RaftArgs) when is_list(RaftArgs) ->
    child_spec(Application, RaftArgs, #{});
child_spec(RaftArgs, Options) ->
    {ok, Application} = application:get_application(),
    child_spec(Application, RaftArgs, Options).

-spec child_spec(Application :: atom(), Specs :: [wa_raft:args()], Options :: options()) -> supervisor:child_spec().
%% Child spec that starts this supervisor through `start_link/3` as a
%% permanent supervisor child with an unbounded shutdown time.
child_spec(Application, RaftArgs, Options) ->
    StartMFA = {?MODULE, start_link, [Application, RaftArgs, Options]},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => permanent,
        shutdown => infinity,
        type => supervisor,
        modules => [?MODULE]
    }.

%% Publishes the normalized application options in persistent_term, starts the
%% supervisor under its default registered name and then starts one partition
%% per entry in `RaftArgs`. The first partition that fails to start raises its
%% error reason in the calling process.
-spec start_link(Application :: atom(), Specs :: [wa_raft:args()], Options :: options()) -> supervisor:startlink_ret().
start_link(Application, RaftArgs, Options) ->
    RaftApplication = normalize_spec(Application, Options),
    ok = persistent_term:put(?OPTIONS_KEY(Application), RaftApplication),
    RegisteredName = default_name(Application),
    case supervisor:start_link({local, RegisteredName}, ?MODULE, Application) of
        {ok, Supervisor} = Started ->
            [
                case start_partition(Supervisor, PartitionSpec) of
                    {error, Reason} -> error(Reason);
                    _Started -> ok
                end
             || PartitionSpec <- RaftArgs
            ],
            Started;
        NotStarted ->
            NotStarted
    end.

%%-------------------------------------------------------------------
%% API
%%-------------------------------------------------------------------

%% Starts a partition as a dynamic child of the given RAFT supervisor.
-spec start_partition(Supervisor :: atom() | pid(), Spec :: wa_raft:args()) -> supervisor:startchild_ret().
start_partition(Supervisor, Spec) ->
    ExtraArgs = [Spec],
    supervisor:start_child(Supervisor, ExtraArgs).

%% Same as `start_partition/2`, addressing the supervisor by the default
%% registered name for `Application`.
-spec start_partition_under_application(Application :: atom(), Spec :: wa_raft:args()) -> supervisor:startchild_ret().
start_partition_under_application(Application, Spec) ->
    Supervisor = default_name(Application),
    start_partition(Supervisor, Spec).

%% Terminates the partition whose partition supervisor is `Pid`.
-spec stop_partition(Supervisor :: atom() | pid(), Pid :: pid()) -> ok | {error, atom()}.
stop_partition(Supervisor, Pid) ->
    supervisor:terminate_child(Supervisor, Pid).

-spec stop_partition(Supervisor :: atom() | pid(), Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> ok | {error, atom()}.
%% Looks the partition supervisor up by its registered name and terminates it.
%% Returns `{error, not_found}` when no process is registered under that name.
stop_partition(Supervisor, Table, Partition) ->
    PartitionSupName = wa_raft_part_sup:registered_name(Table, Partition),
    case whereis(PartitionSupName) of
        PartitionSup when is_pid(PartitionSup) -> stop_partition(Supervisor, PartitionSup);
        _NotRunning -> {error, not_found}
    end.

%% Same as `stop_partition/2`, addressing the supervisor by the default
%% registered name for `Application`.
-spec stop_partition_under_application(Application :: atom(), Pid :: pid()) -> ok | {error, atom()}.
stop_partition_under_application(Application, Pid) ->
    Supervisor = default_name(Application),
    stop_partition(Supervisor, Pid).

%% Same as `stop_partition/3`, addressing the supervisor by the default
%% registered name for `Application`.
-spec stop_partition_under_application(Application :: atom(), Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> ok | {error, atom()}.
stop_partition_under_application(Application, Table, Partition) ->
    Supervisor = default_name(Application),
    stop_partition(Supervisor, Table, Partition).

%%-------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------

%% Registered name of the RAFT supervisor for `Application`:
%% `raft_sup_<application>`.
-spec default_name(Application :: atom()) -> atom().
default_name(Application) ->
    Suffix = atom_to_list(Application),
    list_to_atom("raft_sup_" ++ Suffix).

%% Default configuration search order: the application itself first, then the
%% RAFT application.
-spec default_config_apps(Application :: atom()) -> [atom()].
default_config_apps(Application) ->
    [Application, ?RAFT_APPLICATION].

%% Configuration search order stored for `Application`. Raises
%% `{raft_not_started, Application}` when no options have been stored.
-spec registered_config_apps(Application :: atom()) -> [atom()].
registered_config_apps(Application) ->
    case options(Application) of
        undefined -> error({raft_not_started, Application});
        RaftApplication -> RaftApplication#raft_application.config_search_apps
    end.

%% Options record stored in persistent_term for `Application`, or `undefined`
%% when nothing is stored.
-spec options(Application :: atom()) -> #raft_application{} | undefined.
options(Application) ->
    Key = ?OPTIONS_KEY(Application),
    persistent_term:get(Key, undefined).

-spec normalize_spec(Application :: atom(), Options :: options()) -> #raft_application{}.
%% Expands the user-facing options map into a full `#raft_application{}`
%% record. Without an explicit `config_search_apps` entry only the application
%% itself is searched.
normalize_spec(Application, Options) ->
    SearchApps = maps:get(config_search_apps, Options, [Application]),
    #raft_application{
        name = Application,
        config_search_apps = SearchApps
    }.

%%-------------------------------------------------------------------
%% Test API
%%-------------------------------------------------------------------

%% Stores default options for `Application` without starting a supervisor.
-spec prepare_application(Application :: atom()) -> ok.
prepare_application(Application) ->
    prepare_application(Application, #{}).

%% Stores the normalized `Options` for `Application` in persistent_term
%% without starting a supervisor.
-spec prepare_application(Application :: atom(), Options :: options()) -> ok.
prepare_application(Application, Options) ->
    Key = ?OPTIONS_KEY(Application),
    ok = persistent_term:put(Key, normalize_spec(Application, Options)).

%%-------------------------------------------------------------------
%% Supervisor callbacks
%%-------------------------------------------------------------------

%% `simple_one_for_one` supervisor whose only child template is the partition
%% supervisor for `Application`; at most 10 restarts within 1 second.
-spec init(Application :: atom()) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}.
init(Application) ->
    SupFlags = #{strategy => simple_one_for_one, intensity => 10, period => 1},
    PartitionTemplate = wa_raft_part_sup:child_spec(Application),
    {ok, {SupFlags, [PartitionTemplate]}}.
--------------------------------------------------------------------------------
/src/wa_raft_log_ets.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% This module is an implementation of a completely in-memory RAFT
%%% log provider that uses ETS as a backing store for the log data.
%%% This module is only suitable as a log provider for a fully
%%% in-memory RAFT cluster and should not be used when any durability
%%% guarantees are required against node shutdown.

-module(wa_raft_log_ets).
-compile(warn_missing_spec_all).
-behaviour(wa_raft_log).

%% RAFT log provider interface for accessing log data
-export([
    first_index/1,
    last_index/1,
    fold/6,
    fold_terms/5,
    get/2,
    term/2,
    config/1
]).

%% RAFT log provider interface for writing new log data
-export([
    append/4
]).

%% RAFT log provider interface for managing underlying RAFT log
-export([
    init/1,
    open/1,
    close/2,
    reset/3,
    truncate/3,
    trim/3,
    flush/1
]).

-include_lib("wa_raft/include/wa_raft.hrl").

-type state() :: undefined.

%%-------------------------------------------------------------------
%% RAFT log provider interface for accessing log data
%%-------------------------------------------------------------------

%% Smallest key stored in the log table, or `undefined` for an empty table.
-spec first_index(Log :: wa_raft_log:log()) -> undefined | wa_raft_log:log_index().
first_index(#raft_log{name = Table}) ->
    index_or_undefined(ets:first(Table)).

%% Largest key stored in the log table, or `undefined` for an empty table.
-spec last_index(Log :: wa_raft_log:log()) -> undefined | wa_raft_log:log_index().
last_index(#raft_log{name = Table}) ->
    index_or_undefined(ets:last(Table)).

%% Maps the ETS end-of-table marker to `undefined`; keys pass through as-is.
-spec index_or_undefined(wa_raft_log:log_index() | '$end_of_table') -> undefined | wa_raft_log:log_index().
index_or_undefined('$end_of_table') -> undefined;
index_or_undefined(Index) -> Index.

-spec fold(Log :: wa_raft_log:log(),
           Start :: wa_raft_log:log_index() | '$end_of_table',
           End :: wa_raft_log:log_index(),
           SizeLimit :: non_neg_integer() | infinity,
           Func :: fun((Index :: wa_raft_log:log_index(), Size :: non_neg_integer(), Entry :: wa_raft_log:log_entry(), Acc) -> Acc),
           Acc) -> {ok, Acc}.
%% Folds `Func` over the entries from `Start` up to and including `End`,
%% starting with an accumulated size of zero.
fold(Log, Start, End, SizeLimit, Func, Acc) ->
    fold_impl(Log, Start, End, 0, SizeLimit, Func, Acc).

%% Stops once the current key is past `End` or once the accumulated external
%% size of the visited entries has reached `SizeLimit`. The size is checked
%% before visiting an entry, so the entry that crosses the limit is still
%% passed to `Func`.
%% Both stop conditions rely on Erlang term order, where any number sorts
%% before any atom: `End < '$end_of_table'` always holds and
%% `Size >= infinity` never does.
-spec fold_impl(
    Log :: wa_raft_log:log(),
    Start :: wa_raft_log:log_index() | '$end_of_table',
    End :: wa_raft_log:log_index(),
    Size :: non_neg_integer(),
    SizeLimit :: non_neg_integer() | infinity,
    Func :: fun((Index :: wa_raft_log:log_index(), Size :: non_neg_integer(), Entry :: wa_raft_log:log_entry(), Acc) -> Acc),
    Acc
) -> {ok, Acc}.
fold_impl(_Log, Start, End, _Size, _SizeLimit, _Func, Acc) when End < Start ->
    {ok, Acc};
fold_impl(_Log, _Start, _End, Size, SizeLimit, _Func, Acc) when Size >= SizeLimit ->
    {ok, Acc};
fold_impl(#raft_log{name = Table} = Log, Current, End, Size, SizeLimit, Func, Acc) ->
    case ets:lookup(Table, Current) of
        [{Current, Entry}] ->
            EntrySize = erlang:external_size(Entry),
            Next = ets:next(Table, Current),
            NewAcc = Func(Current, EntrySize, Entry, Acc),
            fold_impl(Log, Next, End, Size + EntrySize, SizeLimit, Func, NewAcc);
        [] ->
            %% Nothing stored at this index; continue with the next key.
            fold_impl(Log, ets:next(Table, Current), End, Size, SizeLimit, Func, Acc)
    end.

%% Folds `Func` over the terms of the entries from `Start` up to and including
%% `End`.
-spec fold_terms(Log :: wa_raft_log:log(),
                 Start :: wa_raft_log:log_index() | '$end_of_table',
                 End :: wa_raft_log:log_index(),
                 Func :: fun((Index :: wa_raft_log:log_index(), Entry :: wa_raft_log:log_term(), Acc) -> Acc),
                 Acc) -> {ok, Acc}.
fold_terms(Log, Start, End, Func, Acc) ->
    fold_terms_impl(Log, Start, End, Func, Acc).

-spec fold_terms_impl(
    Log :: wa_raft_log:log(),
    Start :: wa_raft_log:log_index() | '$end_of_table',
    End :: wa_raft_log:log_index(),
    Func :: fun((Index :: wa_raft_log:log_index(), Term :: wa_raft_log:log_term(), Acc) -> Acc),
    Acc
) -> {ok, Acc}.
%% Stops once the current key is past `End`; `'$end_of_table'` sorts after
%% every integer index, so reaching the end of the table also stops the fold.
fold_terms_impl(_Log, Current, End, _Func, Acc) when End < Current ->
    {ok, Acc};
fold_terms_impl(#raft_log{name = Table} = Log, Current, End, Func, Acc) ->
    case ets:lookup(Table, Current) of
        [{Current, {Term, _Op}}] ->
            Next = ets:next(Table, Current),
            NewAcc = Func(Current, Term, Acc),
            fold_terms_impl(Log, Next, End, Func, NewAcc);
        [] ->
            %% Nothing stored at this index; continue with the next key.
            fold_terms_impl(Log, ets:next(Table, Current), End, Func, Acc)
    end.

%% Entry stored at `Index`, or `not_found`.
-spec get(Log :: wa_raft_log:log(), Index :: wa_raft_log:log_index()) -> {ok, Entry :: wa_raft_log:log_entry()} | not_found.
get(#raft_log{name = Table}, Index) ->
    case ets:lookup(Table, Index) of
        [] -> not_found;
        [{Index, Entry}] -> {ok, Entry}
    end.

%% Term of the entry stored at `Index`, or `not_found`.
-spec term(Log :: wa_raft_log:log(), Index :: wa_raft_log:log_index()) -> {ok, Term :: wa_raft_log:log_term()} | not_found.
term(Log, Index) ->
    case get(Log, Index) of
        not_found -> not_found;
        {ok, {Term, _Op}} -> {ok, Term}
    end.

%% Most recent configuration entry in the log. The table is scanned from the
%% highest key downwards for the first object of the shape
%% `{Index, {_, {_, {config, Config}}}}`.
-spec config(Log :: wa_raft_log:log()) -> {ok, Index :: wa_raft_log:log_index(), Entry :: wa_raft_server:config()} | not_found.
config(#raft_log{name = Table}) ->
    MatchSpec = [{{'$1', {'_', {'_', {config, '$2'}}}}, [], [{{'$1', '$2'}}]}],
    case ets:select_reverse(Table, MatchSpec, 1) of
        {[{Index, Config}], _Continuation} -> {ok, Index, Config};
        _NoConfig -> not_found
    end.

%%-------------------------------------------------------------------
%% RAFT log provider interface for writing new log data
%%-------------------------------------------------------------------

%% Inserts `Entries` at consecutive indices directly after the last index of
%% the view. Mode and priority are not used by this provider.
-spec append(View :: wa_raft_log:view(), Entries :: [wa_raft_log:log_entry() | binary()], Mode :: strict | relaxed, Priority :: wa_raft_acceptor:priority()) -> ok.
append(View, Entries, _Mode, _Priority) ->
    Table = wa_raft_log:log_name(View),
    LastIndex = wa_raft_log:last_index(View),
    Objects = append_decode(LastIndex + 1, Entries),
    true = ets:insert(Table, Objects),
    ok.

%% Pairs each entry with its log index, counting up from `Index`. Entries that
%% arrive as binaries are decoded with `binary_to_term/1` first.
%% NOTE(review): decoding does not pass the `safe` option; this presumes the
%% binaries come from a trusted source - confirm against callers.
-spec append_decode(Index :: wa_raft_log:log_index(), Entries :: [wa_raft_log:log_entry() | binary()]) ->
    [{wa_raft_log:log_index(), wa_raft_log:log_entry()}].
append_decode(_Index, []) ->
    [];
append_decode(Index, [Encoded | Rest]) when is_binary(Encoded) ->
    Decoded = binary_to_term(Encoded),
    [{Index, Decoded} | append_decode(Index + 1, Rest)];
append_decode(Index, [Entry | Rest]) ->
    [{Index, Entry} | append_decode(Index + 1, Rest)].

%%-------------------------------------------------------------------
%% RAFT log provider interface for managing underlying RAFT log
%%-------------------------------------------------------------------

%% Creates the public, named, ordered ETS table that holds the log. The table
%% is owned by the calling process.
-spec init(Log :: wa_raft_log:log()) -> ok.
init(#raft_log{name = LogName}) ->
    TableOptions = [ordered_set, public, named_table],
    LogName = ets:new(LogName, TableOptions),
    ok.

%% Nothing needs to be opened for an ETS-backed log; the provider state is
%% always `undefined`.
-spec open(Log :: wa_raft_log:log()) -> {ok, State :: state()}.
open(_Log) ->
    {ok, undefined}.

%% Nothing needs to be closed for an ETS-backed log.
-spec close(Log :: wa_raft_log:log(), State :: state()) -> term().
close(_Log, _State) ->
    ok.

%% Drops all entries and leaves a single placeholder entry
%% `{Index, {Term, undefined}}` at the given position.
-spec reset(Log :: wa_raft_log:log(), Position :: wa_raft_log:log_pos(), State :: state()) ->
    {ok, NewState :: state()}.
reset(#raft_log{name = Table}, #raft_log_pos{index = Index, term = Term}, State) ->
    true = ets:delete_all_objects(Table),
    Placeholder = {Index, {Term, undefined}},
    true = ets:insert(Table, Placeholder),
    {ok, State}.

%% Deletes the entry at `Index` and every entry after it, walking forward
%% with `ets:next/2` until the end of the table.
-spec truncate(Log :: wa_raft_log:log(), Index :: wa_raft_log:log_index() | '$end_of_table', State :: state()) ->
    {ok, NewState :: state()}.
truncate(_Log, '$end_of_table', State) ->
    {ok, State};
truncate(#raft_log{name = Table} = Log, Index, State) ->
    true = ets:delete(Table, Index),
    Next = ets:next(Table, Index),
    truncate(Log, Next, State).

-spec trim(Log :: wa_raft_log:log(), Index :: wa_raft_log:log_index(), State :: state()) ->
    {ok, NewState :: state()}.
%% Deletes every entry strictly before `Index`; the entry at `Index` itself
%% is kept.
trim(Log, Index, State) ->
    LastToDelete = Index - 1,
    ok = trim_impl(Log, LastToDelete),
    {ok, State}.

%% Deletes the entry at `Index` and every entry before it, walking backward
%% with `ets:prev/2` until the start of the table.
-spec trim_impl(Log :: wa_raft_log:log(), Index :: wa_raft_log:log_index() | '$end_of_table') -> ok.
trim_impl(_Log, '$end_of_table') ->
    ok;
trim_impl(#raft_log{name = Table} = Log, Index) ->
    true = ets:delete(Table, Index),
    Previous = ets:prev(Table, Index),
    trim_impl(Log, Previous).

%% Nothing is buffered by this provider, so there is nothing to flush.
-spec flush(Log :: wa_raft_log:log()) -> term().
flush(_Log) ->
    ok.
--------------------------------------------------------------------------------
/src/wa_raft_storage_ets.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% An example implementation of the RAFT storage provider behaviour that
%%% uses ETS as a backing store. This implementation is for demonstration
%%% purposes only and should not be used in actual applications.

-module(wa_raft_storage_ets).
-compile(warn_missing_spec_all).
-behaviour(wa_raft_storage).

-export([
    storage_open/2,
    storage_close/1,
    storage_label/1,
    storage_position/1,
    storage_config/1,
    storage_apply/3,
    storage_apply/4,
    storage_apply_config/3,
    storage_read/3,
    storage_create_snapshot/2,
    storage_create_witness_snapshot/2,
    storage_open_snapshot/3,
    storage_make_empty_snapshot/5
]).

-include_lib("wa_raft/include/wa_raft.hrl").

%% Options used for the ETS table
-define(OPTIONS, [set, public, {read_concurrency, true}, {write_concurrency, true}]).

%% Filename used for the actual ETS table file in a snapshot
-define(SNAPSHOT_FILENAME, "data").

%% Tag used in keys for metadata stored on the behalf of RAFT
-define(METADATA_TAG, '$metadata').
%% Tag used for label metadata stored on behalf of RAFT.
-define(LABEL_TAG, '$label').
%% Tag used for recording the current storage position
-define(POSITION_TAG, '$position').
%% Tag used for tracking if the current storage is incomplete.
-define(INCOMPLETE_TAG, '$incomplete').

-record(state, {
    name :: atom(),
    table :: wa_raft:table(),
    partition :: wa_raft:partition(),
    self :: #raft_identity{},
    storage :: ets:table()
}).

%% Creates a fresh, empty ETS table for the partition. The root directory is
%% not used, so nothing is read from disk.
-spec storage_open(#raft_options{}, file:filename()) -> #state{}.
storage_open(#raft_options{table = Table, partition = Partition, self = Self, storage_name = Name}, _RootDir) ->
    #state{
        name = Name,
        table = Table,
        partition = Partition,
        self = Self,
        storage = ets:new(Name, ?OPTIONS)
    }.

%% Deletes the backing ETS table together with all data stored in it.
-spec storage_close(#state{}) -> ok.
storage_close(#state{storage = Tab}) ->
    true = ets:delete(Tab),
    ok.

%% Position of the last applied operation; an empty `#raft_log_pos{}` when
%% nothing has been applied yet.
-spec storage_position(#state{}) -> wa_raft_log:log_pos().
storage_position(#state{storage = Tab}) ->
    ets:lookup_element(Tab, ?POSITION_TAG, 2, #raft_log_pos{}).

%% Label stored with the last labelled apply, or `undefined` when none is
%% stored.
-spec storage_label(#state{}) -> {ok, Label :: wa_raft_label:label()}.
storage_label(#state{storage = Tab}) ->
    case ets:lookup(Tab, ?LABEL_TAG) of
        [] -> {ok, undefined};
        [{_, Label}] -> {ok, Label}
    end.

%% Stored configuration together with the position it was stored with, or
%% `undefined` when no configuration is stored.
-spec storage_config(#state{}) -> {ok, wa_raft_log:log_pos(), wa_raft_server:config()} | undefined.
storage_config(#state{storage = Tab}) ->
    case ets:lookup(Tab, {?METADATA_TAG, config}) of
        [] -> undefined;
        [{_, {Version, Value}}] -> {ok, Version, Value}
    end.

%% Whether the storage has been marked as incomplete; `false` when no marker
%% is stored.
-spec storage_incomplete(#state{}) -> boolean().
storage_incomplete(#state{storage = Tab}) ->
    ets:lookup_element(Tab, ?INCOMPLETE_TAG, 2, false).

%% Records `Label` and then applies `Command` exactly like `storage_apply/3`.
-spec storage_apply(Command :: wa_raft_acceptor:command(), Position :: wa_raft_log:log_pos(), Label :: wa_raft_label:label(), Storage :: #state{}) -> {ok, #state{}}.
storage_apply(Command, Position, Label, #state{storage = Tab} = State) ->
    true = ets:insert(Tab, {?LABEL_TAG, Label}),
    storage_apply(Command, Position, State).

%% Applies one command and records `Position` as the new storage position:
%%  * `noop` only moves the position;
%%  * `noop_omitted` additionally marks the storage as incomplete;
%%  * `{write, _, Key, Value}` stores the value under `Key`;
%%  * `{delete, _, Key}` removes `Key`.
%% The table component of write and delete commands is ignored.
-spec storage_apply(Command :: wa_raft_acceptor:command(), Position :: wa_raft_log:log_pos(), Storage :: #state{}) -> {ok, #state{}}.
storage_apply(noop, Position, #state{storage = Tab} = State) ->
    true = ets:insert(Tab, {?POSITION_TAG, Position}),
    {ok, State};
storage_apply(noop_omitted, Position, #state{storage = Tab} = State) ->
    Objects = [{?INCOMPLETE_TAG, true}, {?POSITION_TAG, Position}],
    true = ets:insert(Tab, Objects),
    {ok, State};
storage_apply({write, _Table, Key, Value}, Position, #state{storage = Tab} = State) ->
    Objects = [{Key, Value}, {?POSITION_TAG, Position}],
    true = ets:insert(Tab, Objects),
    {ok, State};
storage_apply({delete, _Table, Key}, Position, #state{storage = Tab} = State) ->
    true = ets:delete(Tab, Key),
    true = ets:insert(Tab, {?POSITION_TAG, Position}),
    {ok, State}.

%% Checks that `Config` is acceptable for this storage and then stores it,
%% using `LogPos` both as the configuration position and as the new storage
%% position. The check raises instead of returning an error.
-spec storage_apply_config(
    Config :: wa_raft_server:config(),
    LogPos :: wa_raft_log:log_pos(),
    State :: #state{}
) -> {ok | {error, Reason :: term()}, #state{}}.
storage_apply_config(Config, LogPos, State) ->
    ok = storage_check_config(Config, State),
    storage_apply_config(Config, LogPos, LogPos, State).

-spec storage_apply_config(
    Config :: wa_raft_server:config(),
    ConfigPos :: wa_raft_log:log_pos(),
    LogPos :: wa_raft_log:log_pos(),
    State :: #state{}
) -> {ok | {error, Reason :: term()}, #state{}}.
%% Stores the configuration with its own position and moves the storage
%% position to `LogPos` in a single insert. No validation happens here.
storage_apply_config(Config, ConfigPos, LogPos, #state{storage = Tab} = State) ->
    ConfigObject = {{?METADATA_TAG, config}, {ConfigPos, Config}},
    PositionObject = {?POSITION_TAG, LogPos},
    true = ets:insert(Tab, [ConfigObject, PositionObject]),
    {ok, State}.

%% Serves a read: `noop` always answers `ok`; `{read, _, Key}` answers with
%% the stored value or `not_found`. The position and the table component of
%% the command are ignored.
-spec storage_read(Command :: wa_raft_acceptor:command(), Position :: wa_raft_log:log_pos(), State :: #state{}) -> ok | {ok, Value :: dynamic()} | not_found.
storage_read(noop, _Position, #state{}) ->
    ok;
storage_read({read, _Table, Key}, _Position, #state{storage = Tab}) ->
    case ets:lookup(Tab, Key) of
        [] -> not_found;
        [{_, Value}] -> {ok, Value}
    end.

%% Dumps the whole ETS table into the snapshot data file inside
%% `SnapshotPath`, creating the directory first when needed.
-spec storage_create_snapshot(file:filename(), #state{}) -> ok | {error, Reason :: term()}.
storage_create_snapshot(SnapshotPath, #state{storage = Tab}) ->
    case filelib:ensure_path(SnapshotPath) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            DataFile = filename:join(SnapshotPath, ?SNAPSHOT_FILENAME),
            ets:tab2file(Tab, DataFile)
    end.

%% Writes a snapshot that carries only the current configuration and storage
%% position, without any of the stored data. Fails with a `badmatch` when no
%% configuration is stored.
-spec storage_create_witness_snapshot(file:filename(), #state{}) -> ok | {error, Reason :: term()}.
storage_create_witness_snapshot(SnapshotPath, #state{name = Name, table = Table, partition = Partition, self = Self} = State) ->
    {ok, ConfigPosition, Config} = storage_config(State),
    Position = storage_position(State),
    storage_make_empty_snapshot(Name, Table, Partition, Self, SnapshotPath, Position, Config, ConfigPosition, #{}).

-spec storage_open_snapshot(file:filename(), wa_raft_log:log_pos(), #state{}) -> {ok, #state{}} | {error, Reason :: term()}.
%% Loads the snapshot data file into a new ETS table and switches to it when
%% the position stored in the snapshot equals `SnapshotPosition`. On success
%% the previous table is deleted; on a position mismatch the newly loaded
%% table is deleted and `{error, bad_position}` is returned.
storage_open_snapshot(SnapshotPath, SnapshotPosition, #state{storage = OldTab} = State) ->
    DataFile = filename:join(SnapshotPath, ?SNAPSHOT_FILENAME),
    case ets:file2tab(DataFile) of
        {error, _Reason} = Failure ->
            Failure;
        {ok, NewTab} ->
            LoadedPosition = ets:lookup_element(NewTab, ?POSITION_TAG, 2, #raft_log_pos{}),
            case LoadedPosition =:= SnapshotPosition of
                true ->
                    NewState = State#state{storage = NewTab},
                    storage_check_config(NewState),
                    catch ets:delete(OldTab),
                    {ok, NewState};
                false ->
                    catch ets:delete(NewTab),
                    {error, bad_position}
            end
    end.

%% Validates the configuration currently stored in the storage, if any.
-spec storage_check_config(#state{}) -> ok.
storage_check_config(State) ->
    case storage_config(State) of
        undefined -> ok;
        {ok, _Position, Config} -> storage_check_config(Config, State)
    end.

%% Raises `invalid_incomplete_replica` when the storage is marked incomplete
%% while `Config` makes this node a data replica.
-spec storage_check_config(wa_raft_server:config(), #state{}) -> ok.
storage_check_config(Config, #state{self = Self} = State) ->
    Invalid = storage_incomplete(State) andalso wa_raft_server:is_data_replica(Self, Config),
    case Invalid of
        false -> ok;
        true -> error(invalid_incomplete_replica)
    end.

%% Writes a snapshot that contains only `Config` and `SnapshotPosition`, the
%% latter also serving as the configuration position.
-spec storage_make_empty_snapshot(#raft_options{}, file:filename(), wa_raft_log:log_pos(), wa_raft_server:config(), dynamic()) -> ok | {error, Reason :: term()}.
storage_make_empty_snapshot(#raft_options{table = Table, partition = Partition, self = Self, storage_name = Name}, SnapshotPath, SnapshotPosition, Config, Data) ->
    ConfigPosition = SnapshotPosition,
    storage_make_empty_snapshot(Name, Table, Partition, Self, SnapshotPath, SnapshotPosition, Config, ConfigPosition, Data).

-spec storage_make_empty_snapshot(atom(), wa_raft:table(), wa_raft:partition(), #raft_identity{}, file:filename(), wa_raft_log:log_pos(), wa_raft_server:config(), wa_raft_log:log_pos(), dynamic()) -> ok | {error, Reason :: term()}.
%% Builds a scratch ETS table holding only the configuration and the storage
%% position, dumps it as a snapshot into `SnapshotPath` and returns the
%% result of writing the snapshot. The last argument is not used.
storage_make_empty_snapshot(Name, Table, Partition, Self, SnapshotPath, SnapshotPosition, Config, ConfigPosition, _Data) ->
    Storage = ets:new(Name, ?OPTIONS),
    try
        State = #state{name = Name, table = Table, partition = Partition, self = Self, storage = Storage},
        storage_apply_config(Config, ConfigPosition, SnapshotPosition, State),
        storage_create_snapshot(SnapshotPath, State)
    after
        %% ETS tables are not garbage collected: without this the scratch
        %% table would stay allocated until the calling process exits,
        %% leaking one table per snapshot.
        ets:delete(Storage)
    end.
--------------------------------------------------------------------------------
/src/wa_raft_part_sup.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% OTP Supervisor for monitoring RAFT processes. Correctness of RAFT
%%% relies on the consistency of the signaling between the processes,
%%% this supervisor is configured to restart all RAFT processes
%%% when any of them exits abnormally.

-module(wa_raft_part_sup).
-compile(warn_missing_spec_all).
-behaviour(supervisor).

%% OTP Supervision
-export([
    child_spec/1,
    child_spec/2,
    start_link/2
]).

%% Internal API
-export([
    default_name/2,
    default_partition_path/3,
    registered_name/2,
    registered_partition_path/2
]).

%% Internal API
-export([
    options/2
]).

%% Supervisor callbacks
-export([
    init/1
]).

%% Test API
-export([
    prepare_spec/2
]).

-include_lib("wa_raft/include/wa_raft.hrl").
-include_lib("wa_raft/include/wa_raft_logger.hrl").

%% Key in persistent_term for the options associated with a RAFT partition.
-define(OPTIONS_KEY(Table, Partition), {?MODULE, Table, Partition}).

%%-------------------------------------------------------------------
%% OTP supervision
%%-------------------------------------------------------------------

%% Returns a spec suitable for use with a `simple_one_for_one` supervisor.
%% Only the application is part of the start arguments; the partition spec is
%% expected to be appended when the child is started.
-spec child_spec(Application :: atom()) -> supervisor:child_spec().
child_spec(Application) ->
    StartMFA = {?MODULE, start_link, [Application]},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => permanent,
        shutdown => infinity,
        type => supervisor,
        modules => [?MODULE]
    }.

%% Returns a complete spec for one specific partition.
-spec child_spec(Application :: atom(), Spec :: wa_raft:args()) -> supervisor:child_spec().
child_spec(Application, Spec) ->
    StartMFA = {?MODULE, start_link, [Application, Spec]},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => permanent,
        shutdown => infinity,
        type => supervisor,
        modules => [?MODULE]
    }.

%% Normalizes `Spec`, publishes the resulting options for the partition and
%% starts the partition supervisor under its registered name.
-spec start_link(Application :: atom(), Spec :: wa_raft:args()) -> supervisor:startlink_ret().
start_link(Application, Spec) ->
    %% First normalize the provided specification into a full options record.
    #raft_options{table = Table, partition = Partition, supervisor_name = Name} = Options =
        normalize_spec(Application, Spec),

    %% Then put the declared options for the current RAFT partition into
    %% persistent term for access by shared resources and other services.
    %% The RAFT options for a table are not expected to change during the
    %% runtime of the RAFT application, so repeated updates should not
    %% result in any GC load. Warn if they did change.
    Key = ?OPTIONS_KEY(Table, Partition),
    case persistent_term:get(Key, Options) of
        Options ->
            ok;
        _Changed ->
            ?RAFT_LOG_WARNING(
                ?MODULE_STRING " storing changed options for RAFT partitition ~0p/~0p",
                [Table, Partition]
            )
    end,
    ok = persistent_term:put(Key, Options),

    supervisor:start_link({local, Name}, ?MODULE, Options).

%%-------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------

%% Get the default name for the RAFT partition supervisor associated with the
%% provided RAFT partition: `raft_sup_<table>_<partition>`.
-spec default_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_name(Table, Partition) ->
    TablePart = atom_to_list(Table),
    PartitionPart = integer_to_list(Partition),
    list_to_atom("raft_sup_" ++ TablePart ++ "_" ++ PartitionPart).

%% Get the default location for the database directory associated with the
%% provided RAFT partition given the RAFT root directory:
%% `<root>/<table>.<partition>`.
-spec default_partition_path(Root :: file:filename(), Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Database :: file:filename().
default_partition_path(Root, Table, Partition) ->
    DirName = atom_to_list(Table) ++ "." ++ integer_to_list(Partition),
    filename:join(Root, DirName).

%% Get the registered name for the RAFT partition supervisor associated with the
%% provided RAFT partition or the default name if no registration exists.
-spec registered_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
registered_name(Table, Partition) ->
    case wa_raft_part_sup:options(Table, Partition) of
        undefined -> default_name(Table, Partition);
        Registered -> Registered#raft_options.supervisor_name
    end.

%% Get the registered database directory for the provided RAFT partition. An
%% error is raised if no registration exists.
-spec registered_partition_path(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Database :: file:filename().
registered_partition_path(Table, Partition) ->
    case wa_raft_part_sup:options(Table, Partition) of
        undefined -> error({not_registered, Table, Partition});
        Registered -> Registered#raft_options.database
    end.
%% Fetches the options stored in persistent term for a RAFT partition, or
%% `undefined` when nothing has been stored for it.
-spec options(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> #raft_options{} | undefined.
options(Table, Partition) ->
    Key = ?OPTIONS_KEY(Table, Partition),
    persistent_term:get(Key, undefined).

%% Expands a partition spec into a full options record. Process names and
%% directories are always derived from the table and partition; the pluggable
%% modules are taken from the spec first, then from the application
%% environment, and finally from the compiled-in defaults.
-spec normalize_spec(Application :: atom(), Spec :: wa_raft:args()) -> #raft_options{}.
normalize_spec(Application, #{table := Table, partition := Partition} = Spec) ->
    Root = wa_raft_env:database_path(Application),
    Database = default_partition_path(Root, Table, Partition),
    ServerName = wa_raft_server:default_name(Table, Partition),
    LogName = wa_raft_log:default_name(Table, Partition),
    StorageName = wa_raft_storage:default_name(Table, Partition),
    AcceptorName = wa_raft_acceptor:default_name(Table, Partition),
    % Resolve a module option: the spec wins over the application environment,
    % which in turn wins over the provided default.
    ModuleFor = fun (SpecKey, EnvKey, Default) ->
        maps:get(SpecKey, Spec, wa_raft_env:get_env(Application, EnvKey, Default))
    end,
    #raft_options{
        application = Application,
        table = Table,
        partition = Partition,
        % RAFT identity always uses the default RAFT server name for the partition
        self = #raft_identity{name = wa_raft_server:default_name(Table, Partition), node = node()},
        identifier = #raft_identifier{application = Application, table = Table, partition = Partition},
        database = Database,
        acceptor_name = AcceptorName,
        distribution_module = ModuleFor(distribution_module, raft_distribution_module, ?RAFT_DEFAULT_DISTRIBUTION_MODULE),
        log_name = LogName,
        log_module = ModuleFor(log_module, raft_log_module, ?RAFT_DEFAULT_LOG_MODULE),
        label_module = ModuleFor(label_module, raft_label_module, ?RAFT_DEFAULT_LABEL_MODULE),
        log_catchup_name = wa_raft_log_catchup:default_name(Table, Partition),
        queue_name = wa_raft_queue:default_name(Table, Partition),
        queue_counters = wa_raft_queue:default_counters(),
        queue_reads = wa_raft_queue:default_read_queue_name(Table, Partition),
        server_name = ServerName,
        storage_name = StorageName,
        storage_module = ModuleFor(storage_module, raft_storage_module, ?RAFT_DEFAULT_STORAGE_MODULE),
        supervisor_name = default_name(Table, Partition),
        transport_cleanup_name = wa_raft_transport_cleanup:default_name(Table, Partition),
        transport_directory = wa_raft_transport:default_directory(Database),
        transport_module = ModuleFor(transport_module, {raft_transport_module, transport_module}, ?RAFT_DEFAULT_TRANSPORT_MODULE)
    }.

%%-------------------------------------------------------------------
%% Test API
%%-------------------------------------------------------------------

%% Normalizes a partition spec and stores the resulting options in persistent
%% term without starting any process. Returns the stored options.
-spec prepare_spec(Application :: atom(), Spec :: wa_raft:args()) -> #raft_options{}.
prepare_spec(Application, Spec) ->
    #raft_options{table = Table, partition = Partition} = Normalized = normalize_spec(Application, Spec),
    ok = persistent_term:put(?OPTIONS_KEY(Table, Partition), Normalized),
    Normalized.

%%-------------------------------------------------------------------
%% Supervisor callbacks
%%-------------------------------------------------------------------

%% Declares the per-partition children. The strategy is `one_for_all`, so a
%% crash of any one child restarts all of them.
-spec init(Options :: #raft_options{}) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}.
init(Options) ->
    SupFlags = #{strategy => one_for_all, intensity => 10, period => 1},
    Children = [
        wa_raft_queue:child_spec(Options),
        wa_raft_storage:child_spec(Options),
        wa_raft_log:child_spec(Options),
        wa_raft_log_catchup:child_spec(Options),
        wa_raft_server:child_spec(Options),
        wa_raft_acceptor:child_spec(Options),
        wa_raft_transport_cleanup:child_spec(Options)
    ],
    {ok, {SupFlags, Children}}.
196 | -------------------------------------------------------------------------------- /src/wa_raft_dist_transport.erl: --------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% This module implements transport interface by using erlang OTP dist.

-module(wa_raft_dist_transport).
-compile(warn_missing_spec_all).
-behaviour(gen_server).
-behaviour(wa_raft_transport).

-export([
    child_spec/0,
    start_link/0
]).

-export([
    transport_init/1,
    transport_send/3
]).

-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    terminate/2
]).

-include_lib("wa_raft/include/wa_raft.hrl").
-include_lib("wa_raft/include/wa_raft_logger.hrl").

%% State carried by the sending side between `transport_send/3` calls.
-record(sender_state, {
}).
%% State of the receiving gen_server: the files currently open for writing,
%% keyed by transport and file identifiers.
-record(receiver_state, {
    fds = #{} :: #{{ID :: wa_raft_transport:transport_id(), FileID :: wa_raft_transport:file_id()} => Fd :: file:fd()}
}).

%% Child spec for the receiver process of this transport.
-spec child_spec() -> supervisor:child_spec().
child_spec() ->
    StartMFA = {?MODULE, start_link, []},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => transient,
        shutdown => 5000,
        modules => [?MODULE]
    }.

%% Starts the receiver process, registered locally under the module name.
-spec start_link() -> gen_server:start_ret().
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).

%% Creates the sender state for a peer node. The node is not used.
-spec transport_init(Node :: node()) -> {ok, State :: #sender_state{}}.
transport_init(_Node) ->
    InitialState = #sender_state{},
    {ok, InitialState}.

-spec transport_send(ID :: wa_raft_transport:transport_id(), FileID :: wa_raft_transport:file_id(), State :: #sender_state{}) ->
    {ok, NewState :: #sender_state{}} |
    {stop, Reason :: term(), NewState :: #sender_state{}}.
%% Sends one file of a transport to the peer of that transport. Stops with
%% `{invalid_transport, ID}` when the transport is unknown or has no peer.
transport_send(ID, FileID, State) ->
    ?RAFT_LOG_DEBUG("wa_raft_dist_transport starting to send file ~p/~p", [ID, FileID]),
    case wa_raft_transport:transport_info(ID) of
        {ok, #{peer := Peer}} ->
            transport_send_file(ID, FileID, Peer, State);
        _ ->
            {stop, {invalid_transport, ID}, State}
    end.

%% Resolves the name and path of the file to send. Stops with
%% `{invalid_file, ID, FileID}` when the file is not known to the transport.
-spec transport_send_file(
    wa_raft_transport:transport_id(),
    wa_raft_transport:file_id(),
    node(),
    #sender_state{}
) -> {ok, #sender_state{}} | {stop, term(), #sender_state{}}.
transport_send_file(ID, FileID, Peer, State) ->
    case wa_raft_transport:file_info(ID, FileID) of
        {ok, #{name := File, path := Path}} ->
            transport_send_open(ID, FileID, File, Path, Peer, State);
        _ ->
            {stop, {invalid_file, ID, FileID}, State}
    end.

%% Opens the file for reading and streams it to the peer. The descriptor is
%% always closed again, whether the send succeeds, fails or raises.
-spec transport_send_open(
    wa_raft_transport:transport_id(),
    wa_raft_transport:file_id(),
    term(),
    file:filename_all(),
    node(),
    #sender_state{}
) -> {ok, #sender_state{}} | {stop, term(), #sender_state{}}.
transport_send_open(ID, FileID, File, Path, Peer, State) ->
    case prim_file:open(Path, [binary, read]) of
        {ok, Fd} ->
            try
                % Best effort read-ahead hint; failing to set it is not an error.
                catch prim_file:advise(Fd, 0, 0, sequential),
                case transport_send_loop(ID, FileID, Fd, Peer, State) of
                    {ok, NewState} ->
                        {ok, NewState};
                    {error, Reason, NewState} ->
                        {stop, Reason, NewState}
                end
            after
                prim_file:close(Fd)
            end;
        {error, Reason} ->
            ?RAFT_LOG_ERROR(
                "wa_raft_dist_transport failed to open file ~p/~p (~s) due to ~p",
                [ID, FileID, File, Reason]
            ),
            {stop, {failed_to_open_file, ID, FileID, Reason}, State}
    end.

%% Starts the send loop at offset zero with no chunks in flight, using the
%% configured chunk size and in-flight limit.
-spec transport_send_loop(
    wa_raft_transport:transport_id(),
    wa_raft_transport:file_id(),
    file:fd(),
    node(),
    #sender_state{}
) -> {ok, #sender_state{}} | {error, term(), #sender_state{}}.
transport_send_loop(ID, FileID, Fd, Peer, State) ->
    ChunkSize = ?RAFT_DIST_TRANSPORT_CHUNK_SIZE(),
    MaxInflight = ?RAFT_DIST_TRANSPORT_MAX_INFLIGHT(),
    transport_send_loop(ID, FileID, Fd, 0, Peer, [], ChunkSize, MaxInflight, State).

-spec transport_send_loop(
    wa_raft_transport:transport_id(),
    wa_raft_transport:file_id(),
    file:fd(),
    non_neg_integer() | eof,
    node(),
    [gen_server:request_id()],
    pos_integer(),
    pos_integer(),
    #sender_state{}
) -> {ok, #sender_state{}} | {error, term(), #sender_state{}}.
%% Send loop. `Offset` is the file position of the next chunk to read, or
%% `eof` once the whole file has been read. The request list holds the
%% requests of the chunks that have been sent but not yet acknowledged,
%% oldest first.
%%
%% The file has been fully read and every chunk was acknowledged: tell the
%% receiver that the file is complete.
transport_send_loop(ID, FileID, _Fd, eof, Peer, [], _ChunkSize, _MaxInflight, State) ->
    gen_server:cast({?MODULE, Peer}, {complete, ID, FileID}),
    {ok, State};
%% Either nothing is left to read or the in-flight limit has been reached:
%% wait for the oldest outstanding chunk before doing anything else. The
%% limit is checked against the whole list so that never more than
%% `MaxInflight` chunks are outstanding.
transport_send_loop(ID, FileID, Fd, Offset, Peer, [RequestId | Chunks] = Inflight, ChunkSize, MaxInflight, State)
        when Offset =:= eof orelse length(Inflight) >= MaxInflight ->
    case gen_server:wait_response(RequestId, 5000) of
        {reply, ok} ->
            transport_send_loop(ID, FileID, Fd, Offset, Peer, Chunks, ChunkSize, MaxInflight, State);
        {reply, {error, Reason}} ->
            ?RAFT_LOG_ERROR("wa_raft_dist_transport failed to send file ~p/~p due to receiver error ~p", [ID, FileID, Reason]),
            {error, {receiver_error, ID, FileID, Reason}, State};
        {reply, Unexpected} ->
            % Any other reply is a failure as well; for example, a receiver
            % that reports failures without the `error` tag.
            ?RAFT_LOG_ERROR("wa_raft_dist_transport failed to send file ~p/~p due to receiver error ~p", [ID, FileID, Unexpected]),
            {error, {receiver_error, ID, FileID, Unexpected}, State};
        timeout ->
            ?RAFT_LOG_ERROR("wa_raft_dist_transport timed out while sending file ~p/~p", [ID, FileID]),
            {error, {send_timed_out, ID, FileID}, State};
        {error, {Reason, _}} ->
            ?RAFT_LOG_ERROR("wa_raft_dist_transport failed to send file ~p/~p due to ~p", [ID, FileID, Reason]),
            {error, {send_failed, ID, FileID, Reason}, State}
    end;
%% There is room for another chunk: read it and send it without waiting for
%% the reply.
transport_send_loop(ID, FileID, Fd, Offset, Peer, Chunks, ChunkSize, MaxInflight, State) when is_integer(Offset) ->
    case prim_file:read(Fd, ChunkSize) of
        {ok, Data} ->
            RequestId = gen_server:send_request({?MODULE, Peer}, {chunk, ID, FileID, Offset, Data}),
            wa_raft_transport:update_file_info(ID, FileID,
                fun (#{completed_bytes := Completed} = Info) ->
                    Info#{completed_bytes => Completed + byte_size(Data)}
                end),
            transport_send_loop(ID, FileID, Fd, Offset + byte_size(Data), Peer, Chunks ++ [RequestId], ChunkSize, MaxInflight, State);
        eof ->
            transport_send_loop(ID, FileID, Fd, eof, Peer, Chunks, ChunkSize, MaxInflight, State);
        {error, Reason} ->
            ?RAFT_LOG_ERROR("wa_raft_dist_transport failed to read file ~p/~p due to ~p", [ID, FileID, Reason]),
            {error, {read_failed, ID, FileID, Reason}, State}
    end.

%% Receiver initialization. Exits are trapped so that `terminate/2` runs when
%% the process is shut down.
-spec init(Args :: list()) -> {ok, State :: #receiver_state{}}.
init([]) ->
    process_flag(trap_exit, true),
    {ok, #receiver_state{}}.

%% Writes a received chunk at its offset in the destination file, opening the
%% file first if needed. Replies `ok` on success and `{error, Reason}` on
%% failure, which are the reply shapes the sender matches on.
-spec handle_call(Request, From :: term(), State :: #receiver_state{}) ->
    {reply, Reply :: term(), NewState :: #receiver_state{}} | {noreply, NewState :: #receiver_state{}}
    when Request :: {chunk, wa_raft_transport:transport_id(), wa_raft_transport:file_id(), integer(), binary()}.
handle_call({chunk, ID, FileID, Offset, Data}, _From, #receiver_state{} = State0) ->
    {Reply, NewState} = case open_file(ID, FileID, State0) of
        {ok, Fd, State1} ->
            case prim_file:pwrite(Fd, Offset, Data) of
                ok ->
                    wa_raft_transport:update_file_info(ID, FileID,
                        fun (#{completed_bytes := Completed} = Info) ->
                            Info#{completed_bytes => Completed + byte_size(Data)}
                        end),
                    {ok, State1};
                {error, Reason} ->
                    ?RAFT_LOG_WARNING(
                        "wa_raft_dist_transport receiver failed to write file chunk ~p/~p @ ~p due to ~p",
                        [ID, FileID, Offset, Reason]
                    ),
                    {{error, {write_failed, Reason}}, State1}
            end;
        {error, Reason, State1} ->
            ?RAFT_LOG_WARNING(
                "wa_raft_dist_transport receiver failed to handle file chunk ~p/~p @ ~p due to open failing due to ~p",
                [ID, FileID, Offset, Reason]
            ),
            {{error, {open_failed, Reason}}, State1}
    end,
    {reply, Reply, NewState};
handle_call(Request, From, #receiver_state{} = State) ->
    % No reply is sent for unrecognized calls.
    ?RAFT_LOG_NOTICE("wa_raft_dist_transport got unrecognized call ~p from ~p", [Request, From]),
    {noreply, State}.

-spec handle_cast(Request, State :: #receiver_state{}) -> {noreply, NewState :: #receiver_state{}}
    when Request :: {complete, wa_raft_transport:transport_id(), wa_raft_transport:file_id()}.
%% Handles the end of a file. The file is opened first when it is not open
%% yet, which is the case when no chunk was received for it, and is then
%% closed and reported as complete.
handle_cast({complete, ID, FileID}, #receiver_state{} = State0) ->
    case open_file(ID, FileID, State0) of
        {ok, _Fd, State1} ->
            {ok, State2} = close_file(ID, FileID, State1),
            wa_raft_transport:complete(ID, FileID, ok),
            {noreply, State2};
        {error, Reason, State1} ->
            ?RAFT_LOG_WARNING(
                "wa_raft_dist_transport receiver failed to handle file complete ~p/~p due to open failing due to ~p",
                [ID, FileID, Reason]
            ),
            {noreply, State1}
    end;
handle_cast(Request, #receiver_state{} = State) ->
    ?RAFT_LOG_NOTICE("wa_raft_dist_transport got unrecognized cast ~p", [Request]),
    {noreply, State}.

%% Closes every file that is still open. Files stay open until their
%% `complete` message arrives, so transfers that never completed would
%% otherwise leave their descriptors behind.
-spec terminate(Reason :: term(), State :: #receiver_state{}) -> term().
terminate(Reason, #receiver_state{fds = Fds}) ->
    ?RAFT_LOG_NOTICE("wa_raft_dist_transport terminating due to ~p", [Reason]),
    maps:foreach(
        fun (_Key, Fd) ->
            % Best effort, same as in close_file/3.
            _ = (catch prim_file:close(Fd)),
            ok
        end, Fds),
    ok.

%% Returns the descriptor for a file of a transport. An already open file is
%% reused; otherwise the file is opened for writing at the path recorded for
%% it and remembered in the state.
-spec open_file(ID :: wa_raft_transport:transport_id(), FileID :: wa_raft_transport:file_id(), State :: #receiver_state{}) ->
    {ok, Fd :: file:fd(), NewState :: #receiver_state{}} | {error, Reason :: term(), NewState :: #receiver_state{}}.
open_file(ID, FileID, #receiver_state{fds = Fds} = State0) ->
    case Fds of
        #{{ID, FileID} := Fd} ->
            {ok, Fd, State0};
        #{} ->
            case wa_raft_transport:file_info(ID, FileID) of
                {ok, #{name := File, path := Path}} ->
                    % Best effort; a missing directory surfaces as an open error below.
                    catch filelib:ensure_dir(Path),
                    case prim_file:open(Path, [binary, write]) of
                        {ok, Fd} ->
                            State1 = State0#receiver_state{fds = Fds#{{ID, FileID} => Fd}},
                            {ok, Fd, State1};
                        {error, Reason} ->
                            ?RAFT_LOG_WARNING(
                                "wa_raft_dist_transport receiver failed to open file ~p/~p (~p) due to ~p",
                                [ID, FileID, File, Reason]
                            ),
                            {error, {open_failed, Reason}, State0}
                    end;
                _ ->
                    {error, invalid_file, State0}
            end
    end.
%% Closes the descriptor held for a file of a transport, if there is one, and
%% forgets it. Always succeeds; a failure to close is ignored.
-spec close_file(ID :: wa_raft_transport:transport_id(), FileID :: wa_raft_transport:file_id(), State :: #receiver_state{}) ->
    {ok, NewState :: #receiver_state{}}.
close_file(ID, FileID, #receiver_state{fds = Fds} = State0) ->
    case maps:take({ID, FileID}, Fds) of
        {Fd, Remaining} ->
            catch prim_file:close(Fd),
            {ok, State0#receiver_state{fds = Remaining}};
        error ->
            {ok, State0}
    end.
249 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/wa_raft_snapshot_catchup.erl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This module manages requests to trigger snapshot catchup across all 7 | %%% local RAFT partitions. 8 | 9 | -module(wa_raft_snapshot_catchup). 10 | -compile(warn_missing_spec_all). 11 | -behaviour(gen_server). 12 | 13 | -include_lib("wa_raft/include/wa_raft.hrl"). 14 | -include_lib("wa_raft/include/wa_raft_logger.hrl"). 15 | 16 | % elp:ignore W0048 (no_dialyzer_attribute): improper list expected by gen interface 17 | -dialyzer({no_improper_lists, [handle_cast/2]}). 18 | 19 | %% Supervisor callbacks 20 | -export([ 21 | child_spec/0, 22 | start_link/0 23 | ]). 24 | 25 | %% Public API 26 | -export([ 27 | which_transports/0 28 | ]). 
29 | 30 | %% Internal API 31 | -export([ 32 | catchup/6 33 | ]). 34 | 35 | %% Snapshot catchup server implementation 36 | -export([ 37 | init/1, 38 | handle_call/3, 39 | handle_cast/2, 40 | handle_info/2, 41 | terminate/2 42 | ]). 43 | 44 | %% Testing API 45 | -export([ 46 | init_tables/0 47 | ]). 48 | 49 | -define(SCAN_EVERY_MS, 500). 50 | 51 | -define(PENDING_KEY(Name, Node), {pending, Name, Node}). 52 | 53 | -define(WHICH_TRANSPORTS, which_transports). 54 | -define(CATCHUP(App, Name, Node, Table, Partition, Witness), {catchup, App, Name, Node, Table, Partition, Witness}). 55 | 56 | -type key() :: {Name :: atom(), Node :: node()}. 57 | -type snapshot_key() :: {Table :: wa_raft:table(), Partition :: wa_raft:partition(), Position :: wa_raft_log:log_pos(), Witness :: boolean()}. 58 | 59 | -type which_transports() :: ?WHICH_TRANSPORTS. 60 | -type call() :: which_transports(). 61 | 62 | -type catchup() :: ?CATCHUP(atom(), atom(), node(), wa_raft:table(), wa_raft:partition(), boolean()). 63 | -type cast() :: catchup(). 64 | 65 | -record(transport, { 66 | app :: atom(), 67 | table :: wa_raft:table(), 68 | partition :: wa_raft:partition(), 69 | id :: wa_raft_transport:transport_id(), 70 | snapshot :: {wa_raft_log:log_pos(), Witness :: boolean()} 71 | }). 72 | -record(state, { 73 | % currently active transports 74 | transports = #{} :: #{key() => #transport{}}, 75 | % counts of active transports that are using a particular snapshot 76 | snapshots = #{} :: #{snapshot_key() => pos_integer()}, 77 | % timestamps (ms) after which transports to previously overloaded nodes can be retried 78 | overload_backoffs = #{} :: #{node() => integer()}, 79 | % timestamps (ms) after which repeat transports can be retried 80 | retry_backoffs = #{} :: #{key() => integer()} 81 | }). 82 | 83 | -spec child_spec() -> supervisor:child_spec(). 
child_spec() ->
    StartMFA = {?MODULE, start_link, []},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => permanent,
        shutdown => 30000,
        modules => [?MODULE]
    }.

%% Starts the snapshot catchup server, registered locally under the module name.
-spec start_link() -> supervisor:startlink_ret().
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).

%% Returns the identifiers of the transports that are currently tracked by
%% the snapshot catchup server.
-spec which_transports() -> [wa_raft_transport:transport_id()].
which_transports() ->
    gen_server:call(?MODULE, ?WHICH_TRANSPORTS).

%% Requests a snapshot catchup of the peer `{Name, Node}`. Always returns `ok`.
%% A request is only queued when none is already pending for the same peer,
%% and is silently dropped when the pending table does not exist.
-spec catchup(
    App :: atom(),
    Name :: atom(),
    Node :: node(),
    Table :: wa_raft:table(),
    Partition :: wa_raft:partition(),
    Witness :: boolean()
) -> ok.
catchup(App, Name, Node, Table, Partition, Witness) ->
    try
        % Check ETS to avoid putting duplicate requests into the message queue.
        case ets:insert_new(?MODULE, {?PENDING_KEY(Name, Node)}) of
            true ->
                gen_server:cast(?MODULE, ?CATCHUP(App, Name, Node, Table, Partition, Witness));
            false ->
                ok
        end,
        ok
    catch
        error:badarg ->
            ok
    end.

%% Server initialization: creates the pending request table and schedules the
%% first scan of the active transports. Exits are trapped so that
%% `terminate/2` runs on shutdown.
-spec init(Args :: term()) -> {ok, #state{}}.
init([]) ->
    process_flag(trap_exit, true),
    init_tables(),
    schedule_scan(),
    InitialState = #state{},
    {ok, InitialState}.

%% Creates the public named ETS table that tracks pending catchup requests.
-spec init_tables() -> ok.
init_tables() ->
    TableOptions = [set, public, named_table],
    ?MODULE = ets:new(?MODULE, TableOptions),
    ok.

%% Replies to `which_transports` with the identifiers of all tracked
%% transports. No reply is sent for unrecognized calls.
-spec handle_call(Request :: call(), From :: gen_server:from(), State :: #state{}) -> {noreply, #state{}} | {reply, term(), #state{}}.
handle_call(?WHICH_TRANSPORTS, _From, #state{transports = Transports} = State) ->
    IDs = [ID || _ := #transport{id = ID} <- Transports],
    {reply, IDs, State};
handle_call(Request, From, #state{} = State) ->
    ?RAFT_LOG_NOTICE("received unrecognized call ~P from ~0p", [Request, 25, From]),
    {noreply, State}.

-spec handle_cast(Request :: cast(), State :: #state{}) -> {noreply, #state{}}.
%% Handles a catchup request: when the request is allowed, a snapshot is
%% created and a snapshot transfer to the peer is started. Requests that are
%% not allowed are dropped.
handle_cast(?CATCHUP(App, Name, Node, Table, Partition, Witness), State0) ->
    % Just immediately remove the pending key from the ETS. Doing this here is simpler
    % but permits a bounded number of extra requests to remain in the queue.
    ets:delete(?MODULE, ?PENDING_KEY(Name, Node)),
    Now = erlang:monotonic_time(millisecond),
    case allowed(Now, Name, Node, State0) of
        {true, #state{transports = Transports, snapshots = Snapshots, overload_backoffs = OverloadBackoffs} = State1} ->
            try
                {#raft_log_pos{index = Index, term = Term} = LogPos, Path} = create_snapshot(Table, Partition, Witness),
                SnapshotKey = {Table, Partition, LogPos, Witness},
                case wa_raft_transport:start_snapshot_transfer(Node, Table, Partition, LogPos, Path, Witness, infinity) of
                    {error, receiver_overloaded} ->
                        ?RAFT_LOG_NOTICE(
                            "destination node ~0p is overloaded, abort new transport for ~0p:~0p and try again later",
                            [Node, Table, Partition]
                        ),
                        % No transport was started, so nothing will ever release the
                        % snapshot that was just created. Delete it now unless an
                        % active transport still holds a reference to it.
                        % NOTE(review): assumes delete_snapshot/4 is safe to call on a
                        % snapshot that no transport was started for - confirm.
                        maps:is_key(SnapshotKey, Snapshots) orelse
                            delete_snapshot(Table, Partition, LogPos, Witness),
                        NewOverloadBackoff = Now + ?RAFT_SNAPSHOT_CATCHUP_OVERLOADED_BACKOFF_MS(App),
                        NewOverloadBackoffs = OverloadBackoffs#{Node => NewOverloadBackoff},
                        {noreply, State1#state{overload_backoffs = NewOverloadBackoffs}};
                    {ok, ID} ->
                        ?RAFT_LOG_NOTICE(
                            "started sending snapshot for ~0p:~0p at ~0p:~0p over transport ~0p",
                            [Table, Partition, Index, Term, ID]
                        ),
                        NewTransport = #transport{
                            app = App,
                            table = Table,
                            partition = Partition,
                            id = ID,
                            snapshot = {LogPos, Witness}
                        },
                        NewTransports = Transports#{{Name, Node} => NewTransport},
                        % Count one more transport as using this snapshot.
                        NewSnapshots = maps:update_with(SnapshotKey, fun(V) -> V + 1 end, 1, Snapshots),
                        {noreply, State1#state{transports = NewTransports, snapshots = NewSnapshots}}
                end
            catch
                _T:_E:S ->
                    ?RAFT_LOG_ERROR(
                        "failed to start accepted snapshot transport of ~0p:~0p to ~0p at ~p",
                        [Table, Partition, Node, S]
                    ),
                    {noreply, State1}
            end;
        {false, State1} ->
            {noreply, State1}
    end;
handle_cast(Request, #state{} = State) ->
    ?RAFT_LOG_NOTICE("received unrecognized cast ~P", [Request, 25]),
    {noreply, State}.

%% Handles the periodic `scan` message by checking every tracked transport
%% and then scheduling the next scan.
-spec handle_info(term(), #state{}) -> {noreply, #state{}}.
handle_info(scan, #state{transports = Transports} = State) ->
    NewState = maps:fold(fun scan_transport/3, State, Transports),
    schedule_scan(),
    {noreply, NewState};
handle_info(Info, #state{} = State) ->
    ?RAFT_LOG_NOTICE("received unrecognized info ~P", [Info, 25]),
    {noreply, State}.

%% Cancels every tracked transport and deletes every tracked snapshot.
-spec terminate(Reason :: term(), #state{}) -> term().
terminate(_Reason, #state{transports = Transports, snapshots = Snapshots}) ->
    maps:foreach(
        fun ({_Name, _Node}, #transport{id = ID}) ->
            wa_raft_transport:cancel(ID, terminating)
        end, Transports),
    maps:foreach(
        fun ({Table, Partition, LogPos, Witness}, _) ->
            delete_snapshot(Table, Partition, LogPos, Witness)
        end, Snapshots).

%% Decides whether a new transport to the peer `{Name, Node}` may be started
%% at time `Now`. It may not when the limit of concurrent transports has been
%% reached, when a transport to the peer already exists, or when a backoff for
%% the node or the peer has not yet passed. Backoffs that have passed are
%% removed from the returned state.
-spec allowed(Now :: integer(), Name :: atom(), Node :: node(), State :: #state{}) -> {boolean(), #state{}}.
allowed(Now, Name, Node, #state{transports = Transports, overload_backoffs = OverloadBackoffs, retry_backoffs = RetryBackoffs} = State0) ->
    Key = {Name, Node},
    Limited = maps:size(Transports) >= ?RAFT_MAX_CONCURRENT_SNAPSHOT_CATCHUP(),
    Exists = maps:is_key(Key, Transports),
    % A missing backoff defaults to `Now`, which never counts as still pending.
    Overloaded = maps:get(Node, OverloadBackoffs, Now) > Now,
    Blocked = maps:get(Key, RetryBackoffs, Now) > Now,
    Allowed = not (Limited orelse Exists orelse Overloaded orelse Blocked),
    State1 = case Overloaded of
        true -> State0;
        false -> State0#state{overload_backoffs = maps:remove(Node, OverloadBackoffs)}
    end,
    State2 = case Blocked of
        true -> State1;
        false -> State1#state{retry_backoffs = maps:remove(Key, RetryBackoffs)}
    end,
    {Allowed, State2}.

%% Fold callback used by the periodic scan. Transports reported as `requested`
%% or `running` are left alone; a `completed` transport is finished with the
%% completed backoff; anything else (including a failed info lookup) is
%% finished with the failed backoff.
-spec scan_transport(Key :: key(), Transport :: #transport{}, #state{}) -> #state{}.
scan_transport(Key, #transport{app = App, id = ID} = Transport, State) ->
    case wa_raft_transport:transport_info(ID) of
        {ok, #{status := requested}} ->
            State;
        {ok, #{status := running}} ->
            State;
        {ok, #{status := completed}} ->
            finish_transport(Key, Transport, ?RAFT_SNAPSHOT_CATCHUP_COMPLETED_BACKOFF_MS(App), State);
        _NotActive ->
            finish_transport(Key, Transport, ?RAFT_SNAPSHOT_CATCHUP_FAILED_BACKOFF_MS(App), State)
    end.

%% Stop tracking the transport stored under `Key`: release its reference on
%% the snapshot (deleting the snapshot when this was the last reference) and
%% record a retry backoff of `Backoff` milliseconds for the key.
-spec finish_transport(Key :: key(), Transport :: #transport{}, Backoff :: pos_integer(), State :: #state{}) -> #state{}.
finish_transport(Key, #transport{table = Table, partition = Partition, snapshot = {LogPos, Witness}}, Backoff,
                 #state{transports = Transports, snapshots = Snapshots, retry_backoffs = RetryBackoffs} = State) ->
    Now = erlang:monotonic_time(millisecond),
    SnapshotKey = {Table, Partition, LogPos, Witness},
    RemainingSnapshots =
        case maps:find(SnapshotKey, Snapshots) of
            {ok, 1} ->
                % last transport using this snapshot, so the snapshot can go
                delete_snapshot(Table, Partition, LogPos, Witness),
                maps:remove(SnapshotKey, Snapshots);
            {ok, References} ->
                % other transports still use the snapshot, drop one reference
                Snapshots#{SnapshotKey => References - 1};
            error ->
                % not expected to happen, but there is nothing to release
                Snapshots
        end,
    State#state{
        transports = maps:remove(Key, Transports),
        snapshots = RemainingSnapshots,
        retry_backoffs = RetryBackoffs#{Key => Now + Backoff}
    }.

-spec delete_snapshot(Table :: wa_raft:table(), Partition :: wa_raft:partition(),
                      Position :: wa_raft_log:log_pos(), Witness :: boolean()) -> ok.
delete_snapshot(Table, Partition, Position, Witness) ->
    % Ask the storage server registered for the partition to delete the
    % snapshot whose name is derived from the position and witness flag.
    Storage = wa_raft_storage:registered_name(Table, Partition),
    wa_raft_storage:delete_snapshot(Storage, snapshot_name(Position, Witness)).

%% Arm a one-shot timer that delivers `scan` to the calling process after
%% ?SCAN_EVERY_MS milliseconds. Returns the timer reference.
-spec schedule_scan() -> reference().
schedule_scan() ->
    erlang:send_after(?SCAN_EVERY_MS, self(), scan).

%% Build the snapshot name for a log position. Witness snapshots use
%% ?WITNESS_SNAPSHOT_NAME, all others use ?SNAPSHOT_NAME.
-spec snapshot_name(LogPos :: wa_raft_log:log_pos(), Witness :: boolean()) -> string().
snapshot_name(#raft_log_pos{index = Index, term = Term}, false) ->
    ?SNAPSHOT_NAME(Index, Term);
snapshot_name(#raft_log_pos{index = Index, term = Term}, true) ->
    ?WITNESS_SNAPSHOT_NAME(Index, Term).

%% Have the storage server of the partition create a snapshot (a witness
%% snapshot when `Witness` is true) and return its log position together with
%% the path computed by ?RAFT_SNAPSHOT_PATH. Fails with `badmatch` if the
%% storage server does not answer `{ok, LogPos}`.
-spec create_snapshot(
    Table :: wa_raft:table(),
    Partition :: wa_raft:partition(),
    Witness :: boolean()
) -> {LogPos :: wa_raft_log:log_pos(), Path :: string()}.
create_snapshot(Table, Partition, Witness) ->
    StorageRef = wa_raft_storage:registered_name(Table, Partition),
    {ok, LogPos} = case Witness of
        false ->
            wa_raft_storage:create_snapshot(StorageRef);
        true ->
            wa_raft_storage:create_witness_snapshot(StorageRef)
    end,
    Path = ?RAFT_SNAPSHOT_PATH(Table, Partition, snapshot_name(LogPos, Witness)),
    {LogPos, Path}.

--------------------------------------------------------------------------------
/src/wa_raft_acceptor.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% This module implements the front-end process for accepting commits / reads

-module(wa_raft_acceptor).
-compile(warn_missing_spec_all).
-behaviour(gen_server).

%% OTP supervisor
-export([
    child_spec/1,
    start_link/1
]).

%% Client API - data access
-export([
    commit/2,
    commit/3,
    commit/4,
    commit_async/3,
    commit_async/4,
    read/2,
    read/3
]).

%% Internal API
-export([
    default_name/2,
    registered_name/2
]).

%% gen_server callbacks
-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    terminate/2
]).

-export_type([
    command/0,
    key/0,
    op/0,
    read_op/0,
    priority/0
]).

-export_type([
    call_error_type/0,
    call_error/0,
    call_result/0,
    read_error/0,
    read_error_type/0,
    read_result/0,
    commit_error_type/0,
    commit_error/0,
    commit_result/0
]).

-include_lib("wa_raft/include/wa_raft.hrl").
-include_lib("wa_raft/include/wa_raft_logger.hrl").

%% Commands accepted by the acceptor.
-type command() :: noop_command() | noop_omitted_command() | config_command() | dynamic().
-type noop_command() :: noop.
-type noop_omitted_command() :: noop_omitted.
-type config_command() :: {config, Config :: wa_raft_server:config()}.

%% A commit operation pairs a caller-chosen key with the command to commit.
%% The key is returned in `commit_queue_full` / `apply_queue_full` errors.
-type key() :: term().
-type op() :: {Key :: key(), Command :: command()}.
%% A read as handed to the RAFT server: the caller to reply to and the command.
-type read_op() :: {From :: gen_server:from(), Command :: command()}.
%% Commit priority. Requests that do not carry a priority are handled as `high`.
-type priority() :: high | low.

%% Errors produced by the client-side call wrapper when the `gen_server` call
%% itself exits: `timeout` for a call timeout, `unreachable` for `noproc`,
%% `nodedown` and `shutdown` exits, and `{call_error, Reason}` otherwise.
-type call_error_type() :: timeout | unreachable | {call_error, Reason :: term()}.
-type call_error() :: {error, call_error_type()}.
-type call_result() :: Result :: dynamic() | Error :: call_error().

%% Request, error and result types of strong reads.
-type read_request() :: {read, Command :: command()}.
-type read_error_type() :: not_leader | read_queue_full | apply_queue_full | {notify_redirect, Peer :: node()}.
-type read_error() :: {error, read_error_type()}.
-type read_result() :: Result :: dynamic() | Error :: read_error() | call_error().

%% Synchronous commit request, with or without an explicit priority.
-type commit_request() :: {commit, Op :: op()} | {commit, Op :: op(), Priority :: priority()}.
%% Asynchronous commit request (sent as a cast); carries the `From` handle
%% that should receive the reply.
-type commit_async_request() :: {commit, From :: gen_server:from(), Op :: op()} | {commit, From :: gen_server:from(), Op :: op(), Priority :: priority()}.
-type commit_error_type() ::
    not_leader |
    {commit_queue_full, Key :: key()} |
    {apply_queue_full, Key :: key()} |
    {notify_redirect, Peer :: node()} |
    commit_stalled |
    cancelled.
-type commit_error() :: {error, commit_error_type()}.
-type commit_result() :: Result :: dynamic() | Error :: commit_error() | call_error().

%% Acceptor state
-record(state, {
    % Acceptor service name
    name :: atom(),
    % Server service name
    server :: atom(),
    % Queues handle
    queues :: wa_raft_queue:queues()
}).

%%-------------------------------------------------------------------
%% OTP Supervision
%%-------------------------------------------------------------------

%% Supervisor child specification for the acceptor of one RAFT partition:
%% a transient worker started through start_link/1.
-spec child_spec(Options :: #raft_options{}) -> supervisor:child_spec().
child_spec(Options) ->
    #{
        id => ?MODULE,
        start => {?MODULE, start_link, [Options]},
        restart => transient,
        shutdown => 30000,
        modules => [?MODULE]
    }.

%% Start the acceptor, registered locally under the acceptor name found in
%% the options.
-spec start_link(Options :: #raft_options{}) -> gen_server:start_ret().
start_link(#raft_options{acceptor_name = Name} = Options) ->
    gen_server:start_link({local, Name}, ?MODULE, Options, []).

%%-------------------------------------------------------------------
%% Public API
%%-------------------------------------------------------------------

%% Request that the specified RAFT server commit the provided command. The commit can only be
%% successful if the requested RAFT server is the active leader of the RAFT partition it is a
%% part of. Returns either the result returned by the storage module when applying the command
%% or an error indicating some reason for which the command was not able to be committed or
%% should be retried.
%% This variant uses the default RPC call timeout.
-spec commit(ServerRef :: gen_server:server_ref(), Op :: op()) -> commit_result().
commit(ServerRef, Op) ->
    commit(ServerRef, Op, ?RAFT_RPC_CALL_TIMEOUT()).

%% Same as commit/2 with an explicit call timeout.
-spec commit(ServerRef :: gen_server:server_ref(), Op :: op(), Timeout :: timeout()) -> commit_result().
commit(ServerRef, Op, Timeout) ->
    call(ServerRef, {commit, Op}, Timeout).

%% Same as commit/3 with an explicit priority. Note that the timeout comes
%% before the priority in the argument list.
-spec commit(ServerRef :: gen_server:server_ref(), Op :: op(), Timeout :: timeout(), Priority :: priority()) -> commit_result().
commit(ServerRef, Op, Timeout, Priority) ->
    call(ServerRef, {commit, Op, Priority}, Timeout).

%% Submit a commit without waiting for the result. The request is sent as a
%% cast; `From` identifies who is to receive the reply.
-spec commit_async(ServerRef :: gen_server:server_ref(), From :: {pid(), term()}, Op :: op()) -> ok.
commit_async(ServerRef, From, Op) ->
    gen_server:cast(ServerRef, {commit, From, Op}).

%% Same as commit_async/3 with an explicit priority.
-spec commit_async(ServerRef :: gen_server:server_ref(), From :: {pid(), term()}, Op :: op(), Priority :: priority()) -> ok.
commit_async(ServerRef, From, Op, Priority) ->
    gen_server:cast(ServerRef, {commit, From, Op, Priority}).

% Strong-read
%% Issue a strong read using the default RPC call timeout.
-spec read(ServerRef :: gen_server:server_ref(), Command :: command()) -> read_result().
read(ServerRef, Command) ->
    read(ServerRef, Command, ?RAFT_RPC_CALL_TIMEOUT()).

%% Same as read/2 with an explicit call timeout.
-spec read(ServerRef :: gen_server:server_ref(), Command :: command(), Timeout :: timeout()) -> read_result().
read(ServerRef, Command, Timeout) ->
    call(ServerRef, {read, Command}, Timeout).

-spec call(ServerRef :: gen_server:server_ref(), Request :: term(), Timeout :: timeout()) -> call_result().
call(ServerRef, Request, Timeout) ->
    % Wrapper around gen_server:call/3 that converts exits of the call into
    % `{error, _}` tuples instead of letting them propagate to the caller.
    try
        gen_server:call(ServerRef, Request, Timeout)
    catch
        exit:{timeout, _} -> {error, timeout};
        % no such process, node down and shutdown are all reported as unreachable
        exit:{noproc, _} -> {error, unreachable};
        exit:{{nodedown, _}, _} -> {error, unreachable};
        exit:{shutdown, _} -> {error, unreachable};
        % any other exit reason is passed through, tagged as a call error
        exit:{Other, _} -> {error, {call_error, Other}}
    end.

%%-------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------

%% Get the default name for the RAFT acceptor server associated with the
%% provided RAFT partition.
-spec default_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_name(Table, Partition) ->
    % NOTE(review): creates an atom per table/partition pair; presumably the
    % set of pairs is bounded - confirm against callers.
    list_to_atom("raft_acceptor_" ++ atom_to_list(Table) ++ "_" ++ integer_to_list(Partition)).

%% Get the registered name for the RAFT acceptor server associated with the
%% provided RAFT partition or the default name if no registration exists.
-spec registered_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
registered_name(Table, Partition) ->
    case wa_raft_part_sup:options(Table, Partition) of
        undefined -> default_name(Table, Partition);
        Options -> Options#raft_options.acceptor_name
    end.

%%-------------------------------------------------------------------
%% RAFT Acceptor - Server Callbacks
%%-------------------------------------------------------------------

-spec init(Options :: #raft_options{}) -> {ok, #state{}}.
init(#raft_options{table = Table, partition = Partition, acceptor_name = Name, server_name = Server} = Options) ->
    % Trap exits, announce the start and build the acceptor state from the
    % partition options.
    process_flag(trap_exit, true),

    ?RAFT_LOG_NOTICE("Acceptor[~0p] starting for partition ~0p/~0p", [Name, Table, Partition]),

    InitialState = #state{
        name = Name,
        server = Server,
        queues = wa_raft_queue:queues(Options)
    },
    {ok, InitialState}.

%% Synchronous requests. Accepted reads and commits are not answered here
%% (`noreply`); a rejected request is answered immediately with the error.
%% A commit without a priority is handled as a `high` priority commit.
-spec handle_call(read_request(), gen_server:from(), #state{}) -> {reply, read_result(), #state{}} | {noreply, #state{}};
                 (commit_request(), gen_server:from(), #state{}) -> {reply, commit_result(), #state{}} | {noreply, #state{}}.
handle_call({read, Command}, From, State) ->
    case read_impl(From, Command, State) of
        {error, _} = Rejected -> {reply, Rejected, State};
        continue -> {noreply, State}
    end;
handle_call({commit, Op}, From, State) ->
    ?MODULE:handle_call({commit, Op, high}, From, State);
handle_call({commit, Op, Priority}, From, State) ->
    case commit_impl(From, Op, Priority, State) of
        {error, _} = Rejected -> {reply, Rejected, State};
        continue -> {noreply, State}
    end;
handle_call(Unexpected, From, #state{name = Name} = State) ->
    ?RAFT_LOG_ERROR("Acceptor[~0p] received unexpected call ~0P from ~0p.", [Name, Unexpected, 30, From]),
    {noreply, State}.

%% Asynchronous commits. A rejected commit is answered right away through
%% gen_server:reply/2 using the `From` carried in the request.
-spec handle_cast(commit_async_request(), #state{}) -> {noreply, #state{}}.
handle_cast({commit, From, Op}, State) ->
    ?MODULE:handle_cast({commit, From, Op, high}, State);
handle_cast({commit, From, Op, Priority}, State) ->
    case commit_impl(From, Op, Priority, State) of
        continue -> ok;
        Rejected -> gen_server:reply(From, Rejected)
    end,
    {noreply, State};
handle_cast(Unexpected, #state{name = Name} = State) ->
    ?RAFT_LOG_ERROR("Acceptor[~0p] received unexpected cast ~0P.", [Name, Unexpected, 30]),
    {noreply, State}.

-spec terminate(Reason :: term(), State :: #state{}) -> ok.
terminate(Reason, #state{name = Name}) ->
    % Only logs the termination reason (printed with a depth limit of 30).
    ?RAFT_LOG_NOTICE("Acceptor[~0p] terminating with reason ~0P", [Name, Reason, 30]),
    ok.

%%-------------------------------------------------------------------
%% RAFT Acceptor - Implementations
%%-------------------------------------------------------------------

%% Enqueue a commit.
%% Reserves a slot in the commit queue of the given priority. When the commit
%% or apply queue is full the request is rejected with an error that carries
%% the key of the operation; otherwise the commit is handed to the RAFT server
%% and `continue` is returned. The time spent in this function is reported in
%% microseconds as 'raft.acceptor.commit.func'.
-spec commit_impl(From :: gen_server:from(), Request :: op(), Priority :: priority(), State :: #state{}) -> continue | commit_error().
commit_impl(From, {Key, _} = Op, Priority, #state{name = Name, server = Server, queues = Queues}) ->
    % Durations are measured on the monotonic clock: wall-clock time can jump
    % and would then produce negative or inflated samples.
    StartT = erlang:monotonic_time(microsecond),
    try
        ?RAFT_LOG_DEBUG("Acceptor[~0p] starts to handle commit of ~0P from ~0p.", [Name, Op, 30, From]),
        case wa_raft_queue:commit_started(Queues, Priority) of
            commit_queue_full ->
                ?RAFT_LOG_WARNING(
                    "Acceptor[~0p] is rejecting commit request from ~0p because the commit queue is full.",
                    [Name, From]
                ),
                ?RAFT_COUNT({'raft.acceptor.error.commit_queue_full', Priority}),
                {error, {commit_queue_full, Key}};
            apply_queue_full ->
                ?RAFT_LOG_WARNING(
                    "Acceptor[~0p] is rejecting commit request from ~0p because the apply queue is full.",
                    [Name, From]
                ),
                ?RAFT_COUNT('raft.acceptor.error.apply_queue_full'),
                {error, {apply_queue_full, Key}};
            ok ->
                wa_raft_server:commit(Server, From, Op, Priority),
                continue
        end
    after
        ?RAFT_GATHER('raft.acceptor.commit.func', erlang:monotonic_time(microsecond) - StartT)
    end.

%% Enqueue a strongly-consistent read.
-spec read_impl(gen_server:from(), command(), #state{}) -> continue | read_error().
read_impl(From, Command, #state{name = Name, server = Server, queues = Queues}) ->
    % Reserves a slot in the read queue. When the read or apply queue is full
    % the request is rejected with an error; otherwise the read is handed to
    % the RAFT server and `continue` is returned. The time spent here is
    % reported in microseconds as 'raft.acceptor.strong_read.func'.
    %
    % Durations are measured on the monotonic clock: wall-clock time can jump
    % and would then produce negative or inflated samples.
    StartT = erlang:monotonic_time(microsecond),
    ?RAFT_LOG_DEBUG("Acceptor[~p] starts to handle read of ~0P from ~0p.", [Name, Command, 100, From]),
    try
        case wa_raft_queue:reserve_read(Queues) of
            read_queue_full ->
                ?RAFT_COUNT('raft.acceptor.strong_read.error.read_queue_full'),
                ?RAFT_LOG_WARNING(
                    "Acceptor[~0p] is rejecting read request from ~0p because the read queue is full.",
                    [Name, From]
                ),
                {error, read_queue_full};
            apply_queue_full ->
                ?RAFT_COUNT('raft.acceptor.strong_read.error.apply_queue_full'),
                ?RAFT_LOG_WARNING(
                    "Acceptor[~0p] is rejecting read request from ~0p because the apply queue is full.",
                    [Name, From]
                ),
                {error, apply_queue_full};
            ok ->
                wa_raft_server:read(Server, {From, Command}),
                continue
        end
    after
        ?RAFT_GATHER('raft.acceptor.strong_read.func', erlang:monotonic_time(microsecond) - StartT)
    end.

--------------------------------------------------------------------------------
/src/wa_raft_log_catchup.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% This module manages data catchup to followers.

-module(wa_raft_log_catchup).
-compile(warn_missing_spec_all).
-behaviour(gen_server).

-include_lib("wa_raft/include/wa_raft.hrl").
-include_lib("wa_raft/include/wa_raft_logger.hrl").
-include_lib("wa_raft/include/wa_raft_rpc.hrl").

%% Private API
-export([
    init_tables/0
]).

%% Supervisor callbacks
-export([
    child_spec/1,
    start_link/1
]).

%% Internal API
-export([
    default_name/2,
    registered_name/2
]).

%% RAFT catchup server implementation
-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2
]).

%% API
-export([
    start_catchup_request/6,
    cancel_catchup_request/2,
    is_catching_up/2
]).

%% RAFT log catchup server state
-record(state, {
    application :: atom(),
    % Registered name of this server; also the name of its request ETS table
    name :: atom(),
    table :: wa_raft:table(),
    partition :: wa_raft:partition(),
    self :: #raft_identity{},
    identifier :: #raft_identifier{},

    distribution_module :: module(),
    log :: wa_raft_log:log(),
    server_name :: atom(),

    % Per-peer system time (in milliseconds) before which no new log catchup
    % to that peer is started
    lockouts = #{} :: #{#raft_identity{} => non_neg_integer()}
}).

%% Returning a timeout of `0` to a `gen_server` results in the `gen_server`
%% continuing to process any incoming messages in the message queue and
%% triggering a timeout if and only if there are no messages in the message
%% queue. This server uses this to periodically inspect the message queue
%% and perform log catchup work when there are no messages to process.
-define(CONTINUE_TIMEOUT, 0).

%% Time to wait before checking the request ETS table for any incoming
%% catchup requests.
-define(IDLE_TIMEOUT, 100).

%% Time to wait after the completion of a log catchup before starting another
%% log catchup to the same follower. This time should be at least the time
%% it takes for a full heartbeat interval and then the round trip for the
%% heartbeat and response.
-define(LOCKOUT_PERIOD, 1000).

%% An entry in the catchup request ETS table representing a request to
%% trigger log catchup for a particular peer.
-define(CATCHUP_REQUEST(Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness), {Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness}).

%% An entry in the catchup ETS table that indicates an in-progress log
%% catchup to the specified node.
%% NOTE(review): the second element is written and matched with the peer
%% identity rather than a bare node name - confirm the intended wording.
-define(CATCHUP_RECORD(Catchup, Node), {Catchup, Node}).

%% Global key in persistent_term holding an atomic counters reference for
%% limiting the total number of concurrent catchup by bulk logs transfer.
-define(COUNTER_KEY, {?MODULE, counters}).
%% Index of the counter tracking the number of concurrent bulk logs transfer.
-define(COUNTER_CONCURRENT_CATCHUP, 1).
%% Total number of counters
-define(COUNTER_COUNT, 1).

%% Create the shared resources of this module: the counters reference stored
%% in persistent_term and the public named ETS table of in-progress catchups.
-spec init_tables() -> term().
init_tables() ->
    persistent_term:put(?COUNTER_KEY, counters:new(?COUNTER_COUNT, [atomics])),
    ?MODULE = ets:new(?MODULE, [set, public, named_table, {read_concurrency, true}]).

%% Supervisor child specification for the log catchup server of one RAFT
%% partition: a transient worker started through start_link/1.
-spec child_spec(Options :: #raft_options{}) -> supervisor:child_spec().
child_spec(Options) ->
    #{
        id => ?MODULE,
        start => {?MODULE, start_link, [Options]},
        restart => transient,
        shutdown => 30000,
        modules => [?MODULE]
    }.

%% Start the log catchup server, registered locally under the log catchup
%% name found in the options.
-spec start_link(Options :: #raft_options{}) -> supervisor:startlink_ret().
start_link(#raft_options{log_catchup_name = Name} = Options) ->
    gen_server:start_link({local, Name}, ?MODULE, Options, []).

%% Submit a request to trigger log catchup for a particular follower starting at the index provided.
%% The request is written directly into the ETS table named `Catchup`, keyed by
%% the peer, so a newer request for the same peer replaces an older one.
-spec start_catchup_request(Catchup :: atom(), Peer :: #raft_identity{}, FollowerLastIndex :: wa_raft_log:log_index(),
                            LeaderTerm :: wa_raft_log:log_term(), LeaderCommitIndex :: wa_raft_log:log_index(),
                            Witness :: boolean()) -> ok.
start_catchup_request(Catchup, Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness) ->
    ets:insert(Catchup, ?CATCHUP_REQUEST(Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness)),
    ok.

%% Cancel a request to trigger log catchup for a particular follower.
-spec cancel_catchup_request(Catchup :: atom(), Peer :: #raft_identity{}) -> ok.
cancel_catchup_request(Catchup, Peer) ->
    % Removes the request entry stored under the peer from the ETS table
    % named `Catchup`.
    ets:delete(Catchup, Peer),
    ok.

%% Returns whether or not there exists an in-progress log catchup to the
%% specified follower on the specified catchup server.
-spec is_catching_up(Catchup :: atom(), Peer :: #raft_identity{}) -> boolean().
is_catching_up(Catchup, Peer) ->
    case ets:lookup(?MODULE, Catchup) of
        % `Peer` is already bound, so this only matches a record for this peer
        [?CATCHUP_RECORD(_, Peer)] -> true;
        _ -> false
    end.

%%-------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------

%% Get the default name for the RAFT log catchup server associated with the
%% provided RAFT partition.
-spec default_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_name(Table, Partition) ->
    % elp:ignore W0023 (atoms_exhaustion) - Limit set of inputs
    list_to_atom("raft_log_catchup_" ++ atom_to_list(Table) ++ "_" ++ integer_to_list(Partition)).

%% Get the registered name for the RAFT log catchup server associated with the
%% provided RAFT partition or the default name if no registration exists.
-spec registered_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
registered_name(Table, Partition) ->
    case wa_raft_part_sup:options(Table, Partition) of
        undefined -> default_name(Table, Partition);
        Options -> Options#raft_options.log_catchup_name
    end.

%% RAFT log catchup server implementation
-spec init(Options :: #raft_options{}) -> {ok, #state{}, timeout()}.
init(#raft_options{application = Application, table = Table, partition = Partition, self = Self,
                   identifier = Identifier, distribution_module = DistributionModule,
                   log_name = LogName, log_module = LogModule,
                   log_catchup_name = Name, server_name = Server}) ->
    % Traps exits, creates the request ETS table (named after this server)
    % and returns with a zero timeout so that request processing starts as
    % soon as the message queue is empty.
    process_flag(trap_exit, true),

    ?RAFT_LOG_NOTICE("Catchup[~0p] starting for partition ~0p/~0p", [Name, Table, Partition]),

    Name = ets:new(Name, [set, public, named_table, {write_concurrency, true}]),
    Log = #raft_log{
        name = LogName,
        application = Application,
        table = Table,
        partition = Partition,
        provider = LogModule
    },
    State = #state{
        application = Application,
        name = Name,
        table = Table,
        partition = Partition,
        self = Self,
        identifier = Identifier,
        distribution_module = DistributionModule,
        log = Log,
        server_name = Server
    },

    {ok, State, ?CONTINUE_TIMEOUT}.

%% No calls are supported; they are logged and left without a reply.
-spec handle_call(Request :: term(), From :: gen_server:from(), State :: #state{}) -> {noreply, #state{}, timeout()}.
handle_call(Request, From, #state{name = Name} = State) ->
    ?RAFT_LOG_WARNING("Unexpected call ~0P from ~0p on ~0p", [Request, 30, From, Name]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

%% No casts are supported; they are logged and ignored.
-spec handle_cast(Request :: term(), State :: #state{}) -> {noreply, #state{}, timeout()}.
handle_cast(Request, #state{name = Name} = State) ->
    ?RAFT_LOG_WARNING("Unexpected cast ~0P on ~0p", [Request, 30, Name]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

%% On `timeout`, picks one pending catchup request at random and processes it,
%% or idles for ?IDLE_TIMEOUT when there is none. Any other message is logged
%% and ignored. The spec accepts any term because the second clause handles
%% arbitrary messages and this process traps exits.
-spec handle_info(Info :: term(), State :: #state{}) -> {noreply, #state{}, timeout()}.
handle_info(timeout, #state{name = Name} = State) ->
    case ets:tab2list(Name) of
        [] ->
            {noreply, State, ?IDLE_TIMEOUT};
        Requests ->
            % Select a random log catchup request to process.
            ?CATCHUP_REQUEST(Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness) = lists:nth(rand:uniform(length(Requests)), Requests),
            NewState = send_logs(Peer, FollowerLastIndex, LeaderTerm, LeaderCommitIndex, Witness, State),
            {noreply, NewState, ?CONTINUE_TIMEOUT}
    end;
handle_info(Info, #state{name = Name} = State) ->
    ?RAFT_LOG_WARNING("Unexpected info ~0P on ~0p", [Info, 30, Name]),
    {noreply, State, ?CONTINUE_TIMEOUT}.

%% Remove this server's in-progress record from the shared catchup table.
-spec terminate(Reason :: term(), State :: #state{}) -> term().
terminate(_Reason, #state{name = Name}) ->
    ets:delete(?MODULE, Name).

%% =======================================================================
%% Private functions - Send logs to follower
%%

%% Process one catchup request for `Peer`. The bulk transfer only runs when
%% the peer is not under lockout and the global number of concurrent catchups
%% is below the configured limit. In every case the request entry and the
%% in-progress record are removed afterwards. After a transfer attempt
%% (successful or not) the peer is locked out for ?LOCKOUT_PERIOD.
-spec send_logs(#raft_identity{}, wa_raft_log:log_index(), wa_raft_log:log_term(), wa_raft_log:log_index(), boolean(), #state{}) -> #state{}.
send_logs(Peer, NextLogIndex, LeaderTerm, LeaderCommitIndex, Witness, #state{name = Name, lockouts = Lockouts} = State) ->
    StartMillis = erlang:system_time(millisecond),
    LockoutMillis = maps:get(Peer, Lockouts, 0),
    NewState = case LockoutMillis =< StartMillis of
        true ->
            Counters = persistent_term:get(?COUNTER_KEY),
            case counters:get(Counters, ?COUNTER_CONCURRENT_CATCHUP) < ?RAFT_MAX_CONCURRENT_LOG_CATCHUP() of
                true ->
                    counters:add(Counters, ?COUNTER_CONCURRENT_CATCHUP, 1),
                    % Publish the in-progress catchup so is_catching_up/2 can see it.
                    ets:insert(?MODULE, ?CATCHUP_RECORD(Name, Peer)),
                    try send_logs_impl(Peer, NextLogIndex, LeaderTerm, LeaderCommitIndex, Witness, State) catch
                        T:E:S ->
                            ?RAFT_COUNT('raft.catchup.error'),
                            ?RAFT_LOG_ERROR(
                                "Catchup[~p, term ~p] bulk logs transfer to ~0p failed with ~0p ~0p at ~p",
                                [Name, LeaderTerm, Peer, T, E, S]
                            )
                    after
                        % Always release the concurrency slot taken above.
                        counters:sub(persistent_term:get(?COUNTER_KEY), ?COUNTER_CONCURRENT_CATCHUP, 1)
                    end,
                    EndMillis = erlang:system_time(millisecond),
                    % Duration is reported in microseconds.
                    ?RAFT_GATHER('raft.leader.catchup.duration', (EndMillis - StartMillis) * 1000),
                    State#state{lockouts = Lockouts#{Peer => EndMillis + ?LOCKOUT_PERIOD}};
                false ->
                    State
            end;
        false ->
            ?RAFT_LOG_NOTICE(
                "Catchup[~p, term ~p] skipping bulk logs transfer to ~0p because follower is still under lockout.",
                [Name, LeaderTerm, Peer]
            ),
            State
    end,
    ets:delete(Name, Peer),
    ets:delete(?MODULE, Name),
    NewState.

%% Send log entries to `Peer` batch by batch, starting at `NextLogIndex`,
%% until the log returns no more entries. Each batch is sent as an append
%% entries RPC through the distribution module; on success the next batch
%% starts right after the match index reported by the follower. The process
%% exits with `append_failed`, `{bad_response, _}` or `{new_term, _}` when the
%% follower's response does not allow the transfer to continue.
-spec send_logs_impl(#raft_identity{}, wa_raft_log:log_index(), wa_raft_log:log_term(), wa_raft_log:log_index(), boolean(), #state{}) -> term().
send_logs_impl(#raft_identity{node = PeerNode} = Peer, NextLogIndex, LeaderTerm, LeaderCommitIndex, Witness,
               #state{application = App, name = Name, self = Self, identifier = Identifier, distribution_module = DistributionModule, server_name = Server, log = Log} = State) ->
    PrevLogIndex = NextLogIndex - 1,
    {ok, PrevLogTerm} = wa_raft_log:term(Log, PrevLogIndex),

    LogBatchEntries = ?RAFT_CATCHUP_MAX_ENTRIES_PER_BATCH(App),
    LogBatchBytes = ?RAFT_CATCHUP_MAX_BYTES_PER_BATCH(App),
    Entries = case Witness of
        true ->
            % Witnesses receive stubbed entries instead of the raw ones.
            {ok, RawEntries} = wa_raft_log:get(Log, NextLogIndex, LogBatchEntries, LogBatchBytes),
            wa_raft_server:stub_entries_for_witness(RawEntries);
        false ->
            {ok, RawEntries} = wa_raft_log:entries(Log, NextLogIndex, LogBatchEntries, LogBatchBytes),
            RawEntries
    end,

    case Entries of
        [] ->
            ?RAFT_LOG_NOTICE(
                "Catchup[~0p, term ~p] finishes bulk logs transfer to follower ~0p at ~0p.",
                [Name, LeaderTerm, Peer, NextLogIndex]
            );
        _ ->
            % Replicate the log entries to our peer.
            Dest = {Server, PeerNode},
            Command = wa_raft_server:make_rpc(Self, LeaderTerm, ?APPEND_ENTRIES(PrevLogIndex, PrevLogTerm, Entries, LeaderCommitIndex, 0)),
            Timeout = ?RAFT_CATCHUP_HEARTBEAT_TIMEOUT(),

            try wa_raft_server:parse_rpc(Self, DistributionModule:call(Dest, Identifier, Command, Timeout)) of
                {LeaderTerm, _, ?APPEND_ENTRIES_RESPONSE(PrevLogIndex, true, FollowerMatchIndex, _)} ->
                    send_logs_impl(Peer, FollowerMatchIndex + 1, LeaderTerm, LeaderCommitIndex, Witness, State);
                {LeaderTerm, _, ?APPEND_ENTRIES_RESPONSE(PrevLogIndex, false, _FollowerLastIndex, _)} ->
                    exit(append_failed);
                {LeaderTerm, _, Other} ->
                    exit({bad_response, Other});
                {NewTerm, _, _} ->
                    exit({new_term, NewTerm})
            catch
                % Suppress any `gen_server:call` regurgitation of the potentially large append payload into
                % an error report to avoid excessively large error reports.
                exit:{Reason, _} -> exit(Reason)
            end
    end.

--------------------------------------------------------------------------------
/src/wa_raft_queue.erl:
--------------------------------------------------------------------------------
%%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
%%%
%%% This source code is licensed under the Apache 2.0 license found in
%%% the LICENSE file in the root directory of this source tree.
%%%
%%% This module implements tracking of pending requests and queue limits.

-module(wa_raft_queue).
-compile(warn_missing_spec_all).
-behaviour(gen_server).

%% PUBLIC API
-export([
    queues/1,
    queues/2,
    commit_queue_size/2,
    commit_queue_size/3,
    commit_queue_full/2,
    commit_queue_full/3,
    apply_queue_size/1,
    apply_queue_size/2,
    apply_queue_byte_size/1,
    apply_queue_byte_size/2,
    apply_queue_full/1,
    apply_queue_full/2
]).
27 | 28 | %% INTERNAL API 29 | -export([ 30 | default_name/2, 31 | default_counters/0, 32 | default_read_queue_name/2, 33 | registered_name/2 34 | ]). 35 | 36 | %% PENDING COMMIT QUEUE API 37 | -export([ 38 | commit_started/2, 39 | commit_cancelled/4, 40 | commit_completed/4 41 | ]). 42 | 43 | %% PENDING READ API 44 | -export([ 45 | reserve_read/1, 46 | submit_read/4, 47 | query_reads/2, 48 | fulfill_read/3, 49 | fulfill_incomplete_read/3, 50 | fulfill_all_reads/2 51 | ]). 52 | 53 | %% APPLY QUEUE API 54 | -export([ 55 | reserve_apply/2, 56 | fulfill_apply/2 57 | ]). 58 | 59 | %% OTP SUPERVISION 60 | -export([ 61 | child_spec/1, 62 | start_link/1 63 | ]). 64 | 65 | %% QUEUE SERVER CALLBACKS 66 | -export([ 67 | init/1, 68 | handle_call/3, 69 | handle_cast/2, 70 | terminate/2 71 | ]). 72 | 73 | %% TYPES 74 | -export_type([ 75 | queues/0 76 | ]). 77 | 78 | -include_lib("stdlib/include/ms_transform.hrl"). % used by ets:fun2ms 79 | -include_lib("wa_raft/include/wa_raft.hrl"). 80 | -include_lib("wa_raft/include/wa_raft_logger.hrl"). 81 | 82 | %%------------------------------------------------------------------- 83 | 84 | %% ETS table creation options shared by all queue tables 85 | -define(RAFT_QUEUE_TABLE_OPTIONS, [named_table, public, {read_concurrency, true}, {write_concurrency, true}]). 86 | 87 | %% Total number of counters for RAFT partition specific counters 88 | -define(RAFT_NUMBER_OF_QUEUE_SIZE_COUNTERS, 5). 89 | %% Index into counter reference for counter tracking apply queue size 90 | -define(RAFT_APPLY_QUEUE_SIZE_COUNTER, 1). 91 | %% Index into counter reference for counter tracking apply total byte size 92 | -define(RAFT_APPLY_QUEUE_BYTE_SIZE_COUNTER, 2). 93 | %% Index into counter reference for counter tracking high priority commit queue size 94 | -define(RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER, 3). 95 | %% Index into counter reference for counter tracking low priority commit queue size 96 | -define(RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER, 4).
%% Index into counter reference for counter tracking read queue size
-define(RAFT_READ_QUEUE_SIZE_COUNTER, 5).

%%-------------------------------------------------------------------
%% INTERNAL TYPES
%%-------------------------------------------------------------------

%% State of the queue server process.
-record(state, {
    name :: atom()
}).

%% Handle bundling everything needed to inspect and update the queues
%% of a single RAFT partition.
-record(queues, {
    application :: atom(),
    counters :: atomics:atomics_ref(),
    reads :: atom()
}).
-opaque queues() :: #queues{}.

%%-------------------------------------------------------------------
%% PUBLIC API
%%-------------------------------------------------------------------

%% Build a queues handle from the options of a RAFT partition.
-spec queues(Options :: #raft_options{}) -> Queues :: queues().
queues(Options) ->
    App = Options#raft_options.application,
    Counters = Options#raft_options.queue_counters,
    Reads = Options#raft_options.queue_reads,
    #queues{application = App, counters = Counters, reads = Reads}.

%% Build a queues handle for the given partition, or return 'undefined'
%% when no options are available for that partition.
-spec queues(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Queues :: queues() | undefined.
queues(Table, Partition) ->
    Options = wa_raft_part_sup:options(Table, Partition),
    if
        Options =:= undefined -> undefined;
        true -> queues(Options)
    end.

%% Current number of pending commits of the given priority.
-spec commit_queue_size(Queues :: queues(), Priority :: wa_raft_acceptor:priority()) -> non_neg_integer().
commit_queue_size(#queues{counters = Counters}, Priority) when Priority =:= high; Priority =:= low ->
    Index =
        case Priority of
            high -> ?RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER;
            low -> ?RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER
        end,
    atomics:get(Counters, Index).

%% As commit_queue_size/2 but looked up by partition; reports 0 when the
%% partition has no queues.
-spec commit_queue_size(Table :: wa_raft:table(), Partition :: wa_raft:partition(), Priority :: wa_raft_acceptor:priority()) -> non_neg_integer().
commit_queue_size(Table, Partition, Priority) ->
    Queues = queues(Table, Partition),
    if
        Queues =:= undefined -> 0;
        true -> commit_queue_size(Queues, Priority)
    end.

%% Whether the pending commit count of the given priority has reached the
%% configured limit for the application.
-spec commit_queue_full(Queues :: queues(), Priority :: wa_raft_acceptor:priority()) -> boolean().
commit_queue_full(#queues{application = App, counters = Counters}, high) ->
    PendingHigh = atomics:get(Counters, ?RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER),
    PendingHigh >= ?RAFT_MAX_PENDING_HIGH_PRIORITY_COMMITS(App);
commit_queue_full(#queues{application = App, counters = Counters}, low) ->
    PendingLow = atomics:get(Counters, ?RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER),
    PendingLow >= ?RAFT_MAX_PENDING_LOW_PRIORITY_COMMITS(App).

%% As commit_queue_full/2 but looked up by partition; reports 'false' when
%% the partition has no queues.
-spec commit_queue_full(Table :: wa_raft:table(), Partition :: wa_raft:partition(), Priority :: wa_raft_acceptor:priority()) -> boolean().
commit_queue_full(Table, Partition, Priority) ->
    Queues = queues(Table, Partition),
    if
        Queues =:= undefined -> false;
        true -> commit_queue_full(Queues, Priority)
    end.

%% Current number of entries in the apply queue.
-spec apply_queue_size(Queues :: queues()) -> non_neg_integer().
apply_queue_size(#queues{counters = Ref}) ->
    atomics:get(Ref, ?RAFT_APPLY_QUEUE_SIZE_COUNTER).

%% As apply_queue_size/1 but looked up by partition; reports 0 when the
%% partition has no queues.
-spec apply_queue_size(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> non_neg_integer().
apply_queue_size(Table, Partition) ->
    Queues = queues(Table, Partition),
    if
        Queues =:= undefined -> 0;
        true -> apply_queue_size(Queues)
    end.

%% Current total byte size tracked for the apply queue.
-spec apply_queue_byte_size(Queues :: queues()) -> non_neg_integer().
apply_queue_byte_size(#queues{counters = Ref}) ->
    atomics:get(Ref, ?RAFT_APPLY_QUEUE_BYTE_SIZE_COUNTER).

%% As apply_queue_byte_size/1 but looked up by partition; reports 0 when
%% the partition has no queues.
-spec apply_queue_byte_size(wa_raft:table(), wa_raft:partition()) -> non_neg_integer().
apply_queue_byte_size(Table, Partition) ->
    Queues = queues(Table, Partition),
    if
        Queues =:= undefined -> 0;
        true -> apply_queue_byte_size(Queues)
    end.

%% The apply queue is full once either its entry count or its total byte
%% size reaches the limit configured for the application.
-spec apply_queue_full(Queues :: queues()) -> boolean().
apply_queue_full(#queues{application = App, counters = Ref}) ->
    case atomics:get(Ref, ?RAFT_APPLY_QUEUE_SIZE_COUNTER) >= ?RAFT_MAX_PENDING_APPLIES(App) of
        true ->
            true;
        false ->
            atomics:get(Ref, ?RAFT_APPLY_QUEUE_BYTE_SIZE_COUNTER) >= ?RAFT_MAX_PENDING_APPLY_BYTES(App)
    end.

%% As apply_queue_full/1 but looked up by partition; reports 'false' when
%% the partition has no queues.
-spec apply_queue_full(wa_raft:table(), wa_raft:partition()) -> boolean().
apply_queue_full(Table, Partition) ->
    Queues = queues(Table, Partition),
    if
        Queues =:= undefined -> false;
        true -> apply_queue_full(Queues)
    end.

%%-------------------------------------------------------------------
%% INTERNAL API
%%-------------------------------------------------------------------

%% Get the default name for the RAFT queue server associated with the
%% provided RAFT partition.
-spec default_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_name(Table, Partition) ->
    TableBin = atom_to_binary(Table),
    PartitionBin = integer_to_binary(Partition),
    binary_to_atom(<<"raft_queue_", TableBin/bytes, "_", PartitionBin/bytes>>).

%% Create a properly-sized atomics array for use by a RAFT queue
-spec default_counters() -> Counters :: atomics:atomics_ref().
default_counters() ->
    Size = ?RAFT_NUMBER_OF_QUEUE_SIZE_COUNTERS,
    atomics:new(Size, []).

%% Get the default name for the RAFT read queue ETS table associated with the
%% provided RAFT partition.
-spec default_read_queue_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
default_read_queue_name(Table, Partition) ->
    TableBin = atom_to_binary(Table),
    PartitionBin = integer_to_binary(Partition),
    binary_to_atom(<<"raft_read_queue_", TableBin/bytes, "_", PartitionBin/bytes>>).

%% Get the registered name for the RAFT queue server associated with the
%% provided RAFT partition or the default name if no registration exists.
-spec registered_name(Table :: wa_raft:table(), Partition :: wa_raft:partition()) -> Name :: atom().
registered_name(Table, Partition) ->
    Options = wa_raft_part_sup:options(Table, Partition),
    if
        Options =:= undefined -> default_name(Table, Partition);
        true -> Options#raft_options.queue_name
    end.

%%-------------------------------------------------------------------
%% PENDING COMMIT QUEUE API
%%-------------------------------------------------------------------

%% Try to account for a new pending commit of the given priority. The commit
%% queue limit is checked first, then the apply queue limit; only when both
%% have room is the matching commit counter incremented and 'ok' returned.
-spec commit_started(Queues :: queues(), Priority :: wa_raft_acceptor:priority()) -> ok | apply_queue_full | commit_queue_full.
commit_started(#queues{counters = Counters} = Queues, Priority) ->
    CommitFull = commit_queue_full(Queues, Priority),
    % The apply queue is only inspected when the commit queue has room.
    ApplyFull = (not CommitFull) andalso apply_queue_full(Queues),
    if
        CommitFull ->
            ?RAFT_COUNT({'raft.acceptor.commit.queue.full', Priority}),
            commit_queue_full;
        ApplyFull ->
            apply_queue_full;
        true ->
            Index =
                case Priority of
                    high -> ?RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER;
                    low -> ?RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER
                end,
            PendingCommits = atomics:add_get(Counters, Index, 1),
            ?RAFT_GATHER({'raft.acceptor.commit.request.pending', Priority}, PendingCommits),
            ok
    end.


%% Release the slot held by a pending commit that will not complete and,
%% unless the reason is 'undefined', send the reason to the waiting caller.
-spec commit_cancelled(Queues :: queues(), From :: gen_server:from(), Reason :: wa_raft_acceptor:commit_error() | undefined, Priority :: wa_raft_acceptor:priority()) -> ok.
commit_cancelled(#queues{counters = Counters}, From, Reason, Priority) ->
    Index =
        case Priority of
            high -> ?RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER;
            low -> ?RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER
        end,
    atomics:sub(Counters, Index, 1),
    case Reason of
        undefined -> ok;
        _ -> gen_server:reply(From, Reason)
    end,
    ok.

%% Release the slot held by a pending commit and deliver its reply to the
%% waiting caller.
-spec commit_completed(Queues :: queues(), From :: gen_server:from(), Reply :: term(), Priority :: wa_raft_acceptor:priority()) -> ok.
commit_completed(#queues{counters = Counters}, From, Reply, Priority) ->
    Index =
        case Priority of
            high -> ?RAFT_HIGH_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER;
            low -> ?RAFT_LOW_PRIORITY_COMMIT_QUEUE_SIZE_COUNTER
        end,
    atomics:sub(Counters, Index, 1),
    gen_server:reply(From, Reply),
    ok.

%%-------------------------------------------------------------------
%% PENDING READ QUEUE API
%%-------------------------------------------------------------------

% Checks the read queue and then the apply queue to decide whether a strong
% read may currently be submitted to the RAFT server. When both have room,
% the read counter is incremented and 'ok' is returned. Reserving and the
% later insertion into the reads table are separate steps because the
% acceptor does not have enough information to add the read to the ETS
% table directly.
-spec reserve_read(Queues :: queues()) -> ok | read_queue_full | apply_queue_full.
reserve_read(#queues{application = App, counters = Counters}) ->
    PendingReads = atomics:get(Counters, ?RAFT_READ_QUEUE_SIZE_COUNTER),
    ReadsFull = PendingReads >= ?RAFT_MAX_PENDING_READS(App),
    % The apply queue is only inspected when the read queue has room.
    AppliesFull = (not ReadsFull) andalso
        (atomics:get(Counters, ?RAFT_APPLY_QUEUE_SIZE_COUNTER) >= ?RAFT_MAX_PENDING_APPLIES(App)),
    if
        ReadsFull ->
            read_queue_full;
        AppliesFull ->
            apply_queue_full;
        true ->
            ?RAFT_GATHER('raft.acceptor.strong_read.request.pending', PendingReads + 1),
            atomics:add(Counters, ?RAFT_READ_QUEUE_SIZE_COUNTER, 1),
            ok
    end.

% Called from the RAFT server once it knows the proper ReadIndex for the
% read request to add the read request to the reads table for storage
% to handle upon applying.
-spec submit_read(Queues :: queues(), wa_raft_log:log_index(), term(), term()) -> ok.
submit_read(#queues{reads = Reads}, ReadIndex, From, Command) ->
    % A fresh reference keeps keys unique for reads at the same index.
    Key = {ReadIndex, make_ref()},
    ets:insert(Reads, {Key, From, Command}),
    ok.

% Returns the key and command of every pending read whose read index is
% at most MaxLogIndex.
-spec query_reads(Queues :: queues(), wa_raft_log:log_index() | infinity) -> [{{wa_raft_log:log_index(), reference()}, term()}].
query_reads(#queues{reads = Reads}, MaxLogIndex) ->
    Spec = ets:fun2ms(
        fun({{Index, Ref}, _, Cmd}) when Index =< MaxLogIndex ->
            {{Index, Ref}, Cmd}
        end
    ),
    ets:select(Reads, Spec).

% Removes the pending read stored under Reference and, if it was present,
% decrements the read counter and replies to its caller.
-spec fulfill_read(Queues :: queues(), term(), dynamic()) -> ok | not_found.
fulfill_read(#queues{counters = Counters, reads = Reads}, Reference, Reply) ->
    Taken = ets:take(Reads, Reference),
    case Taken of
        [] ->
            not_found;
        [{Reference, From, _}] ->
            atomics:sub(Counters, ?RAFT_READ_QUEUE_SIZE_COUNTER, 1),
            gen_server:reply(From, Reply)
    end.

% Complete a read that was reserved by the RAFT acceptor but was rejected
% before it could be added to the read queue and so has no reference.
-spec fulfill_incomplete_read(Queues :: queues(), gen_server:from(), wa_raft_acceptor:read_error()) -> ok.
fulfill_incomplete_read(#queues{counters = Counters}, From, Reply) ->
    ok = atomics:sub(Counters, ?RAFT_READ_QUEUE_SIZE_COUNTER, 1),
    gen_server:reply(From, Reply).

% Fulfill every pending read with an error that indicates that the read was not completed.
-spec fulfill_all_reads(Queues :: queues(), wa_raft_acceptor:read_error()) -> ok.
fulfill_all_reads(#queues{counters = Counters, reads = Reads}, Reply) ->
    Fulfill =
        fun ({Key, _, _}) ->
            % Take the entry again so a read that was removed in the
            % meantime is neither counted nor replied to twice.
            case ets:take(Reads, Key) of
                [] ->
                    ok;
                [{Key, From, _}] ->
                    atomics:sub(Counters, ?RAFT_READ_QUEUE_SIZE_COUNTER, 1),
                    gen_server:reply(From, Reply)
            end
        end,
    lists:foreach(Fulfill, ets:tab2list(Reads)).

%%-------------------------------------------------------------------
%% APPLY QUEUE API
%%-------------------------------------------------------------------

% Accounts for one more pending apply of the given byte size.
-spec reserve_apply(Queues :: queues(), non_neg_integer()) -> ok.
reserve_apply(#queues{counters = Ref}, Size) ->
    ok = atomics:add(Ref, ?RAFT_APPLY_QUEUE_SIZE_COUNTER, 1),
    atomics:add(Ref, ?RAFT_APPLY_QUEUE_BYTE_SIZE_COUNTER, Size).

% Releases the accounting for one pending apply of the given byte size.
-spec fulfill_apply(Queues :: queues(), non_neg_integer()) -> ok.
fulfill_apply(#queues{counters = Ref}, Size) ->
    ok = atomics:sub(Ref, ?RAFT_APPLY_QUEUE_SIZE_COUNTER, 1),
    atomics:sub(Ref, ?RAFT_APPLY_QUEUE_BYTE_SIZE_COUNTER, Size).

%%-------------------------------------------------------------------
%% OTP SUPERVISION
%%-------------------------------------------------------------------

% Supervisor child specification for the queue server of a partition.
-spec child_spec(Options :: #raft_options{}) -> supervisor:child_spec().
child_spec(Options) ->
    StartMFA = {?MODULE, start_link, [Options]},
    #{
        id => ?MODULE,
        start => StartMFA,
        restart => transient,
        shutdown => 1000,
        modules => [?MODULE]
    }.

% Starts the queue server, locally registered under the configured queue name.
-spec start_link(Options :: #raft_options{}) -> gen_server:start_ret().
start_link(#raft_options{queue_name = Name} = Options) ->
    ServerName = {local, Name},
    gen_server:start_link(ServerName, ?MODULE, Options, []).

%%-------------------------------------------------------------------
%% QUEUE SERVER CALLBACKS
%%-------------------------------------------------------------------

%% Initializes the queue server: traps exits, zeroes every queue counter
%% and creates the named ETS table that holds pending reads.
-spec init(Options :: #raft_options{}) -> {ok, #state{}}.
init(
    #raft_options{
        table = Table,
        partition = Partition,
        queue_name = Name,
        queue_counters = Counters,
        queue_reads = ReadsName
    }
) ->
    process_flag(trap_exit, true),

    ?RAFT_LOG_NOTICE(
        "Queue[~p] starting for partition ~0p/~0p with read queue ~0p",
        [Name, Table, Partition, ReadsName]
    ),

    % The queue process is the first process in the supervision for a single
    % RAFT partition. The supervisor is configured to restart all processes if
    % even a single process fails. Since the queue process is starting up, all
    % queues tracked should be empty so reset all counters.
    lists:foreach(
        fun (Index) -> atomics:put(Counters, Index, 0) end,
        lists:seq(1, ?RAFT_NUMBER_OF_QUEUE_SIZE_COUNTERS)
    ),

    % Create ETS table for pending reads. The match asserts that the table
    % was created under the expected name.
    ReadsName = ets:new(ReadsName, [ordered_set | ?RAFT_QUEUE_TABLE_OPTIONS]),

    {ok, #state{name = Name}}.

%% The queue server accepts no calls; any request is logged and left
%% unanswered.
-spec handle_call(Request :: term(), From :: gen_server:from(), State :: #state{}) -> {noreply, #state{}}.
handle_call(Request, From, #state{name = Name} = State) ->
    ?RAFT_LOG_NOTICE("Queue[~p] got unexpected request ~0P from ~0p", [Name, Request, 100, From]),
    {noreply, State}.

%% The queue server accepts no casts; any cast is logged and ignored.
-spec handle_cast(Request :: term(), State :: #state{}) -> {noreply, #state{}}.
handle_cast(Request, #state{name = Name} = State) ->
    ?RAFT_LOG_NOTICE("Queue[~p] got unexpected cast ~0P", [Name, Request, 100]),
    {noreply, State}.

%% Logs the reason for termination.
-spec terminate(Reason :: term(), State :: #state{}) -> term().
terminate(Reason, #state{name = Name}) ->
    ?RAFT_LOG_NOTICE("Queue[~p] terminating due to ~0P", [Name, Reason, 100]).
424 | -------------------------------------------------------------------------------- /include/wa_raft.hrl: -------------------------------------------------------------------------------- 1 | %%% Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 2 | %%% 3 | %%% This source code is licensed under the Apache 2.0 license found in 4 | %%% the LICENSE file in the root directory of this source tree. 5 | %%% 6 | %%% This file defines general macros and data structures shared across modules. 7 | 8 | %% The name of the RAFT application. 9 | -define(RAFT_APPLICATION, wa_raft). 10 | 11 | %%------------------------------------------------------------------- 12 | %% Registered information about applications and partitions 13 | %%------------------------------------------------------------------- 14 | 15 | %% Name of the application environment key that is expected to contain a path 16 | %% to the directory in which RAFT should place the partition paths for the 17 | %% RAFT partitions under a RAFT client application. This environment value uses 18 | %% the application search order to determine the value to use. 19 | -define(RAFT_DATABASE, raft_database). 20 | %% Default location containing databases for RAFT partitions part of a RAFT client application 21 | -define(RAFT_DATABASE_PATH(Application), (wa_raft_env:database_path(Application))). 22 | %% Registered database location for the specified RAFT partition 23 | -define(RAFT_PARTITION_PATH(Table, Partition), (wa_raft_part_sup:registered_partition_path(Table, Partition))). 24 | 25 | %% Registered name of the RAFT partition supervisor for a RAFT partition 26 | -define(RAFT_SUPERVISOR_NAME(Table, Partition), (wa_raft_part_sup:registered_name(Table, Partition))). 27 | %% Registered name of the RAFT acceptor server for a RAFT partition 28 | -define(RAFT_ACCEPTOR_NAME(Table, Partition), (wa_raft_acceptor:registered_name(Table, Partition))). 
29 | %% Registered name of the RAFT log server for a RAFT partition 30 | -define(RAFT_LOG_NAME(Table, Partition), (wa_raft_log:registered_name(Table, Partition))). 31 | %% Registered name of the RAFT log catchup server for a RAFT partition 32 | -define(RAFT_LOG_CATCHUP_NAME(Table, Partition), (wa_raft_log_catchup:registered_name(Table, Partition))). 33 | %% Registered name of the RAFT server for a RAFT partition 34 | -define(RAFT_SERVER_NAME(Table, Partition), (wa_raft_server:registered_name(Table, Partition))). 35 | %% Registered name of the RAFT storage server for a RAFT partition 36 | -define(RAFT_STORAGE_NAME(Table, Partition), (wa_raft_storage:registered_name(Table, Partition))). 37 | 38 | %% Default distribution provider module 39 | -define(RAFT_DEFAULT_DISTRIBUTION_MODULE, wa_raft_distribution). 40 | %% Default log provider module 41 | -define(RAFT_DEFAULT_LOG_MODULE, wa_raft_log_ets). 42 | %% Default storage provider module 43 | -define(RAFT_DEFAULT_STORAGE_MODULE, wa_raft_storage_ets). 44 | %% Default module for handling outgoing transports 45 | -define(RAFT_DEFAULT_TRANSPORT_MODULE, wa_raft_dist_transport). 46 | %% Default module for log labeling 47 | -define(RAFT_DEFAULT_LABEL_MODULE, undefined). 48 | 49 | %% RAFT election max weight 50 | -define(RAFT_ELECTION_MAX_WEIGHT, 10). 51 | %% Raft election default weight 52 | -define(RAFT_ELECTION_DEFAULT_WEIGHT, ?RAFT_ELECTION_MAX_WEIGHT). 53 | 54 | %% Name of server state persist file 55 | -define(STATE_FILE_NAME, "state"). 56 | %% Name prefix for snapshots 57 | -define(SNAPSHOT_PREFIX, "snapshot"). 58 | %% Snapshot name 59 | -define(SNAPSHOT_NAME(Index, Term), (?SNAPSHOT_PREFIX "." ++ integer_to_list(Index) ++ "." ++ integer_to_list(Term))). 60 | 61 | %% Witness Snapshot name 62 | -define(WITNESS_SNAPSHOT_NAME(Index, Term), (?SNAPSHOT_PREFIX "." ++ integer_to_list(Index) ++ "." ++ integer_to_list(Term) ++ ".witness")). 
63 | 64 | %% Location of a snapshot 65 | -define(RAFT_SNAPSHOT_PATH(Path, Name), (filename:join(Path, Name))). 66 | -define(RAFT_SNAPSHOT_PATH(Table, Partition, Name), ?RAFT_SNAPSHOT_PATH(?RAFT_PARTITION_PATH(Table, Partition), Name)). 67 | -define(RAFT_SNAPSHOT_PATH(Table, Partition, Index, Term), ?RAFT_SNAPSHOT_PATH(Table, Partition, ?SNAPSHOT_NAME(Index, Term))). 68 | 69 | %% [Transport] Atomics - field index for update timestamp 70 | -define(RAFT_TRANSPORT_ATOMICS_UPDATED_TS, 1). 71 | %% [Transport] Transport atomics - field count 72 | -define(RAFT_TRANSPORT_TRANSPORT_ATOMICS_COUNT, 1). 73 | %% [Transport] File atomics - field count 74 | -define(RAFT_TRANSPORT_FILE_ATOMICS_COUNT, 1). 75 | 76 | -define(READ_OP, '$read'). 77 | 78 | %%------------------------------------------------------------------- 79 | %% Metrics 80 | %%------------------------------------------------------------------- 81 | 82 | -define(RAFT_METRICS_MODULE_KEY, {?RAFT_APPLICATION, raft_metrics_module}). 83 | -define(RAFT_METRICS_MODULE, (persistent_term:get(?RAFT_METRICS_MODULE_KEY, wa_raft_metrics))). 84 | -define(RAFT_COUNT(Metric), ?RAFT_METRICS_MODULE:count(Metric)). 85 | -define(RAFT_COUNTV(Metric, Value), ?RAFT_METRICS_MODULE:countv(Metric, Value)). 86 | -define(RAFT_GATHER(Metric, Value), ?RAFT_METRICS_MODULE:gather(Metric, Value)). 87 | -define(RAFT_GATHER_LATENCY(Metric, Value), ?RAFT_METRICS_MODULE:gather_latency(Metric, Value)). 88 | 89 | %%------------------------------------------------------------------- 90 | %% Global Configuration 91 | %%------------------------------------------------------------------- 92 | 93 | %% Get global config 94 | -define(RAFT_CONFIG(Name), (application:get_env(?RAFT_APPLICATION, Name))). 95 | -define(RAFT_CONFIG(Name, Default), (application:get_env(?RAFT_APPLICATION, Name, Default))). 96 | 97 | %% Default metrics module 98 | -define(RAFT_METRICS_MODULE(), ?RAFT_CONFIG(raft_metrics_module)). 
99 | 100 | %% Default Call timeout for all cross node gen_server:call 101 | -define(RAFT_RPC_CALL_TIMEOUT(), ?RAFT_CONFIG(raft_rpc_call_timeout, 10000)). 102 | %% Default call timeout for storage related operation (we need bigger default since storage can be slower) 103 | -define(RAFT_STORAGE_CALL_TIMEOUT(), ?RAFT_CONFIG(raft_storage_call_timeout, 60000)). 104 | 105 | %% Maximum number of concurrent catchups by bulk log transfer 106 | -define(RAFT_MAX_CONCURRENT_LOG_CATCHUP(), ?RAFT_CONFIG(raft_max_log_catchup, 5)). 107 | %% Maximum number of concurrent catchups by snapshot transfer 108 | -define(RAFT_MAX_CONCURRENT_SNAPSHOT_CATCHUP(), ?RAFT_CONFIG(raft_max_snapshot_catchup, 5)). 109 | %% Maximum number of incoming snapshots by snapshot transfer. 110 | -define(RAFT_MAX_CONCURRENT_INCOMING_SNAPSHOT_TRANSFERS(), ?RAFT_CONFIG(raft_max_incoming_snapshot_transfers, 5)). 111 | %% Maximum number of incoming witness snapshots by snapshot transfer. 112 | -define(RAFT_MAX_CONCURRENT_INCOMING_WITNESS_SNAPSHOT_TRANSFERS(), ?RAFT_CONFIG(raft_max_incoming_witness_snapshot_transfers, 10)). 113 | 114 | %% Default cross-node call timeout for heartbeats made for bulk logs catchup 115 | -define(RAFT_CATCHUP_HEARTBEAT_TIMEOUT(), ?RAFT_CONFIG(raft_catchup_rpc_timeout_ms, 5000)). 116 | 117 | %% Number of workers to use for transports 118 | -define(RAFT_TRANSPORT_THREADS(), ?RAFT_CONFIG(raft_transport_threads, 1)). 119 | %% Time in seconds after which a transport that has not made progress should be considered failed 120 | -define(RAFT_TRANSPORT_IDLE_TIMEOUT(), ?RAFT_CONFIG(transport_idle_timeout_secs, 30)). 121 | 122 | %% Maximum number of previous inactive transports to retain info for. 123 | -define(RAFT_TRANSPORT_INACTIVE_INFO_LIMIT(), ?RAFT_CONFIG(raft_transport_inactive_info_limit, 30)). 
124 | 125 | %% Size in bytes of individual chunks (messages containing file data) to be sent during transports 126 | %% using the dist transport provider 127 | -define(RAFT_DIST_TRANSPORT_CHUNK_SIZE(), ?RAFT_CONFIG(dist_transport_chunk_size, 1 * 1024 * 1024)). 128 | %% Maximum number of chunks that can be sent by the dist transport provider without being 129 | %% acknowledged by the recipient 130 | -define(RAFT_DIST_TRANSPORT_MAX_INFLIGHT(), ?RAFT_CONFIG(dist_transport_max_inflight, 4)). 131 | 132 | %%------------------------------------------------------------------- 133 | %% Application-specific Configuration 134 | %%------------------------------------------------------------------- 135 | 136 | %% Get application-scoped config 137 | -define(RAFT_APP_CONFIG(App, Name, Default), (wa_raft_env:get_env(App, Name, Default))). 138 | 139 | %% Maximum number of pending applies for any single RAFT partition 140 | -define(RAFT_MAX_PENDING_APPLIES, raft_max_pending_applies). 141 | -define(RAFT_MAX_PENDING_APPLIES(App), ?RAFT_APP_CONFIG(App, {?RAFT_MAX_PENDING_APPLIES, raft_apply_queue_max_size}, 1000)). 142 | %% Maximum bytes of pending applies for any single RAFT partition 143 | -define(RAFT_MAX_PENDING_APPLY_BYTES, raft_max_pending_apply_bytes). 144 | -define(RAFT_MAX_PENDING_APPLY_BYTES(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_PENDING_APPLY_BYTES, 32_000_000)). 145 | %% Maximum number of pending high priority commits for any single RAFT partition 146 | -define(RAFT_MAX_PENDING_HIGH_PRIORITY_COMMITS, raft_max_pending_high_priority_commits). 147 | -define(RAFT_MAX_PENDING_HIGH_PRIORITY_COMMITS(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_PENDING_HIGH_PRIORITY_COMMITS, 1500)). 148 | %% Maximum number of pending low priority commits for any single RAFT partition 149 | -define(RAFT_MAX_PENDING_LOW_PRIORITY_COMMITS, raft_max_pending_low_priority_commits). 150 | -define(RAFT_MAX_PENDING_LOW_PRIORITY_COMMITS(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_PENDING_LOW_PRIORITY_COMMITS, 250)). 
151 | %% Maximum number of pending reads for any single RAFT partition 152 | -define(RAFT_MAX_PENDING_READS, raft_max_pending_reads). 153 | -define(RAFT_MAX_PENDING_READS(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_PENDING_READS, 5000)). 154 | 155 | %% Whether or not this node is eligible to be leader. 156 | -define(RAFT_LEADER_ELIGIBLE, raft_leader_eligible). 157 | -define(RAFT_LEADER_ELIGIBLE(App), (?RAFT_APP_CONFIG(App, ?RAFT_LEADER_ELIGIBLE, true) =/= false)). 158 | %% Time in milliseconds during which a leader was unable to replicate heartbeats to a 159 | %% quorum of followers before considering the leader to be stale. 160 | -define(RAFT_LEADER_STALE_INTERVAL, raft_max_heartbeat_age_msecs). 161 | -define(RAFT_LEADER_STALE_INTERVAL(App), ?RAFT_APP_CONFIG(App, ?RAFT_LEADER_STALE_INTERVAL, 180 * 1000)). 162 | %% Relative "weight" at which this node will trigger elections and thereby be elected. 163 | -define(RAFT_ELECTION_WEIGHT, raft_election_weight). 164 | -define(RAFT_ELECTION_WEIGHT(App), ?RAFT_APP_CONFIG(App, ?RAFT_ELECTION_WEIGHT, ?RAFT_ELECTION_DEFAULT_WEIGHT)). 165 | %% Interval in milliseconds between heartbeats sent by RAFT leaders with no pending log entries 166 | -define(RAFT_HEARTBEAT_INTERVAL, raft_heartbeat_interval_ms). 167 | -define(RAFT_HEARTBEAT_INTERVAL(App), ?RAFT_APP_CONFIG(App, ?RAFT_HEARTBEAT_INTERVAL, 120)). 168 | %% Maximum number of log entries to include in a single heartbeat 169 | -define(RAFT_HEARTBEAT_MAX_ENTRIES, raft_max_log_entries_per_heartbeat). 170 | -define(RAFT_HEARTBEAT_MAX_ENTRIES(App), ?RAFT_APP_CONFIG(App, ?RAFT_HEARTBEAT_MAX_ENTRIES, 15)). 171 | %% Maximum bytes of log entries to include in a single heartbeat 172 | -define(RAFT_HEARTBEAT_MAX_BYTES, raft_max_heartbeat_size). 173 | -define(RAFT_HEARTBEAT_MAX_BYTES(App), ?RAFT_APP_CONFIG(App, ?RAFT_HEARTBEAT_MAX_BYTES, 1 * 1024 * 1024)). 
174 | %% Time in milliseconds to wait to collect pending log entries into a single heartbeat before 175 | %% triggering a heartbeat due to having pending log entries 176 | -define(RAFT_COMMIT_BATCH_INTERVAL, raft_commit_batch_interval_ms). 177 | -define(RAFT_COMMIT_BATCH_INTERVAL(App), ?RAFT_APP_CONFIG(App, ?RAFT_COMMIT_BATCH_INTERVAL, 2)). 178 | %% Maximum number of pending log entries to collect before a heartbeat is forced. This should 179 | %% be at most equal to the maximum number of log entries permitted per heartbeat. 180 | -define(RAFT_COMMIT_BATCH_MAX_ENTRIES, raft_commit_batch_max). 181 | -define(RAFT_COMMIT_BATCH_MAX_ENTRIES(App), ?RAFT_APP_CONFIG(App, ?RAFT_COMMIT_BATCH_MAX_ENTRIES, 15)). 182 | %% Maximum number of log entries to speculatively retain in the log due to followers 183 | %% not yet reporting having replicated the log entry locally 184 | -define(RAFT_MAX_RETAINED_ENTRIES, raft_max_retained_entries). 185 | -define(RAFT_MAX_RETAINED_ENTRIES(App), ?RAFT_APP_CONFIG(App, {?RAFT_MAX_RETAINED_ENTRIES, max_log_rotate_delay}, 1500000)). 186 | 187 | %% Maximum number of log entries to queue for application by storage at once before 188 | %% continuing to process the incoming message queue on the RAFT server. 189 | -define(RAFT_MAX_CONSECUTIVE_APPLY_ENTRIES, raft_apply_log_batch_size). 190 | -define(RAFT_MAX_CONSECUTIVE_APPLY_ENTRIES(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_CONSECUTIVE_APPLY_ENTRIES, 200)). 191 | %% Maximum bytes of log entries to queue for application by storage at once before 192 | %% continuing to process the incoming message queue on the RAFT server. 193 | -define(RAFT_MAX_CONSECUTIVE_APPLY_BYTES, raft_apply_batch_max_bytes). 194 | -define(RAFT_MAX_CONSECUTIVE_APPLY_BYTES(App), ?RAFT_APP_CONFIG(App, ?RAFT_MAX_CONSECUTIVE_APPLY_BYTES, 200 * 4 * 1024)). 195 | 196 | %% Minimum time in milliseconds since the receiving the last valid leader heartbeat 197 | %% before triggering a new election due to term timeout. 
This time should be much 198 | %% greater than the maximum expected network delay. 199 | -define(RAFT_ELECTION_TIMEOUT_MIN, raft_election_timeout_ms). 200 | -define(RAFT_ELECTION_TIMEOUT_MIN(App), ?RAFT_APP_CONFIG(App, ?RAFT_ELECTION_TIMEOUT_MIN, 5000)). 201 | %% Maximum time in milliseconds since receiving the last valid leader heartbeat 202 | %% before triggering a new election due to term timeout. The difference between this 203 | %% time and the minimum election timeout should be much greater than the expected 204 | %% variance in network delay. 205 | -define(RAFT_ELECTION_TIMEOUT_MAX, raft_election_timeout_ms_max). 206 | -define(RAFT_ELECTION_TIMEOUT_MAX(App), ?RAFT_APP_CONFIG(App, ?RAFT_ELECTION_TIMEOUT_MAX, 7500)). 207 | 208 | %% The maximum time in milliseconds during which a leader can continue to be considered live without 209 | %% receiving an updated heartbeat response quorum from replicas or during which a follower or witness 210 | %% can be considered live without receiving a heartbeat from a valid leader of the current term. 211 | -define(RAFT_LIVENESS_GRACE_PERIOD_MS, raft_liveness_grace_period_ms). 212 | -define(RAFT_LIVENESS_GRACE_PERIOD_MS(App), ?RAFT_APP_CONFIG(App, ?RAFT_LIVENESS_GRACE_PERIOD_MS, 30_000)). 213 | %% The maximum number of log entries that can be not yet applied to a follower or witness's log 214 | %% compared to the leader's commit index before the replica is considered stale. 215 | -define(RAFT_STALE_GRACE_PERIOD_ENTRIES, raft_stale_grace_period_entries). 216 | -define(RAFT_STALE_GRACE_PERIOD_ENTRIES(App), ?RAFT_APP_CONFIG(App, ?RAFT_STALE_GRACE_PERIOD_ENTRIES, 5_000)). 217 | 218 | %% Minimum amount of time in seconds since the last successfully received 219 | %% heartbeat from a leader of a term for non-forced promotion to be allowed. 220 | -define(RAFT_PROMOTION_GRACE_PERIOD, raft_promotion_grace_period_secs). 221 | -define(RAFT_PROMOTION_GRACE_PERIOD(App), ?RAFT_APP_CONFIG(App, ?RAFT_PROMOTION_GRACE_PERIOD, 60)).
%% Cap on the number of log entries carried by a Handover RPC that passes
%% leadership to another peer. The cap stops a handover from trying to ship a
%% huge number of log entries to catch a peer up.
-define(RAFT_HANDOVER_MAX_ENTRIES, raft_max_handover_log_entries).
-define(RAFT_HANDOVER_MAX_ENTRIES(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_HANDOVER_MAX_ENTRIES, 200)
).
%% Cap on the total number of log entries from the leader's current log that a
%% peer has not yet confirmed as applied. This keeps a node that has received
%% every current log entry, but is still behind in applying them to the
%% underlying storage, from becoming leader through handover before it is
%% ready. Defaults to the maximum number of missing log entries.
%% (See `?RAFT_HANDOVER_MAX_ENTRIES`.)
-define(RAFT_HANDOVER_MAX_UNAPPLIED_ENTRIES, raft_handover_max_unapplied_entries).
-define(RAFT_HANDOVER_MAX_UNAPPLIED_ENTRIES(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_HANDOVER_MAX_UNAPPLIED_ENTRIES, undefined)
).
%% Cap on the total byte size of the log entries carried by a Handover RPC.
-define(RAFT_HANDOVER_MAX_BYTES, raft_max_handover_log_size).
-define(RAFT_HANDOVER_MAX_BYTES(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_HANDOVER_MAX_BYTES, 50 * 1024 * 1024)
).
%% How long (milliseconds) to wait before a previously triggered handover is
%% considered to have failed.
-define(RAFT_HANDOVER_TIMEOUT, raft_handover_timeout_ms).
-define(RAFT_HANDOVER_TIMEOUT(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_HANDOVER_TIMEOUT, 600)
).

%% Minimum number of log entries past the minimum kept by the RAFT server
%% before log rotation is triggered.
-define(RAFT_LOG_ROTATION_INTERVAL, raft_max_log_records_per_file).
-define(RAFT_LOG_ROTATION_INTERVAL(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_LOG_ROTATION_INTERVAL, 200000)
).
%% Maximum number of log entries past the minimum kept by the RAFT server to
%% retain in the log after rotation. The fallback is ten times `Interval`.
%% `Interval` is wrapped in parentheses because macro arguments are
%% substituted as raw tokens: without them an argument such as `A + B` would
%% expand to `A + B * 10`.
-define(RAFT_LOG_ROTATION_KEEP, raft_max_log_records).
-define(RAFT_LOG_ROTATION_KEEP(App, Interval),
    ?RAFT_APP_CONFIG(App, ?RAFT_LOG_ROTATION_KEEP, (Interval) * 10)
).
%% Whether log rotation should be controlled by local log length or by
%% leader-announced cluster trimming index. Expands to a boolean expression
%% that is true only when the configured value is exactly `true`.
%% NOTE(review): `use_trim_index` looks like an alternate (legacy?) config
%% key - confirm against the definition of ?RAFT_APP_CONFIG.
-define(RAFT_LOG_ROTATION_BY_TRIM_INDEX, raft_rotate_by_trim_index).
-define(RAFT_LOG_ROTATION_BY_TRIM_INDEX(App),
    (?RAFT_APP_CONFIG(App, {?RAFT_LOG_ROTATION_BY_TRIM_INDEX, use_trim_index}, false) =:= true)
).

%% Whether or not the log should return entries in external term format
%% when log entries are fetched for heartbeats. Expands to a boolean
%% expression that is true only when the configured value is exactly `true`.
-define(RAFT_LOG_HEARTBEAT_BINARY_ENTRIES, raft_log_heartbeat_binary_entries).
-define(RAFT_LOG_HEARTBEAT_BINARY_ENTRIES(App),
    (?RAFT_APP_CONFIG(App, ?RAFT_LOG_HEARTBEAT_BINARY_ENTRIES, false) =:= true)
).

%% Minimum number of log entries after which RAFT servers should use bulk
%% logs catchup to bring peers back into sync if enabled.
-define(RAFT_CATCHUP_BULK_LOG_THRESHOLD, raft_catchup_threshold).
-define(RAFT_CATCHUP_BULK_LOG_THRESHOLD(App),
    ?RAFT_APP_CONFIG(App, {?RAFT_CATCHUP_BULK_LOG_THRESHOLD, catchup_max_follower_lag}, 50000)
).
%% Minimum number of unapplied log entries after which RAFT servers should use
%% snapshot catchup to bring peers back into sync if enabled.
-define(RAFT_CATCHUP_APPLY_BACKLOG_THRESHOLD, raft_catchup_apply_backlog_threshold).
-define(RAFT_CATCHUP_APPLY_BACKLOG_THRESHOLD(App),
    ?RAFT_APP_CONFIG(App, {?RAFT_CATCHUP_APPLY_BACKLOG_THRESHOLD, catchup_max_follower_apply_backlog}, 100000)
).
%% Maximum log entries per heartbeat for catchup by bulk log transfer
-define(RAFT_CATCHUP_MAX_ENTRIES_PER_BATCH, raft_catchup_log_batch_entries).
-define(RAFT_CATCHUP_MAX_ENTRIES_PER_BATCH(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_CATCHUP_MAX_ENTRIES_PER_BATCH, 800)
).
%% Cap on the bytes sent per heartbeat during catchup by bulk log transfer.
-define(RAFT_CATCHUP_MAX_BYTES_PER_BATCH, raft_catchup_log_batch_bytes).
-define(RAFT_CATCHUP_MAX_BYTES_PER_BATCH(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_CATCHUP_MAX_BYTES_PER_BATCH, 4 * 1024 * 1024)
).
%% Delay before retrying a snapshot transport to an overloaded peer.
-define(RAFT_SNAPSHOT_CATCHUP_OVERLOADED_BACKOFF_MS, snapshot_catchup_overloaded_backoff_ms).
-define(RAFT_SNAPSHOT_CATCHUP_OVERLOADED_BACKOFF_MS(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_SNAPSHOT_CATCHUP_OVERLOADED_BACKOFF_MS, 1000)
).
%% Delay before a completed snapshot transport is allowed to be rerun.
-define(RAFT_SNAPSHOT_CATCHUP_COMPLETED_BACKOFF_MS, raft_snapshot_catchup_completed_backoff_ms).
-define(RAFT_SNAPSHOT_CATCHUP_COMPLETED_BACKOFF_MS(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_SNAPSHOT_CATCHUP_COMPLETED_BACKOFF_MS, 20 * 1000)
).
%% Delay before a failed snapshot transport is allowed to be rerun.
-define(RAFT_SNAPSHOT_CATCHUP_FAILED_BACKOFF_MS, raft_snapshot_catchup_failed_backoff_ms).
-define(RAFT_SNAPSHOT_CATCHUP_FAILED_BACKOFF_MS(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_SNAPSHOT_CATCHUP_FAILED_BACKOFF_MS, 10 * 1000)
).

%% When operating as a witness: the number of omitted log entries to skip
%% actually applying to storage.
-define(RAFT_STORAGE_WITNESS_APPLY_INTERVAL, raft_storage_witness_apply_interval).
-define(RAFT_STORAGE_WITNESS_APPLY_INTERVAL(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_STORAGE_WITNESS_APPLY_INTERVAL, 5000)
).

%% Controls whether the storage server asks for more log entries once its
%% apply queue is empty.
-define(RAFT_STORAGE_NOTIFY_COMPLETE, raft_storage_notify_complete).
-define(RAFT_STORAGE_NOTIFY_COMPLETE(App),
    (?RAFT_APP_CONFIG(App, ?RAFT_STORAGE_NOTIFY_COMPLETE, true) =:= true)
).

%% How long (seconds) transport destination directories are kept after use.
-define(RAFT_TRANSPORT_RETAIN_INTERVAL, transport_retain_min_secs).
-define(RAFT_TRANSPORT_RETAIN_INTERVAL(App),
    ?RAFT_APP_CONFIG(App, ?RAFT_TRANSPORT_RETAIN_INTERVAL, 300)
).

%%-------------------------------------------------------------------
%% Records
%%-------------------------------------------------------------------

%% A position in the log: sequence number plus the term it was written in.
-record(raft_log_pos, {
    %% Sequence number of the log entry.
    index = 0 :: wa_raft_log:log_index(),
    %% Term of the leader at the time the log entry was created.
    term = 0 :: wa_raft_log:log_term()
}).

%% Handle to a log.
-record(raft_log, {
    name :: wa_raft_log:log_name(),
    application :: atom(),
    table :: wa_raft:table(),
    partition :: wa_raft:partition(),
    provider :: module()
}).

%% Identity of a RAFT replica, used to tell different RAFT replicas apart.
%% The structure of this record is not guaranteed to stay compatible between
%% versions of RAFT, so it should be neither persisted between runtimes nor
%% sent between RAFT servers. Inspecting its fields is generally allowed, but
%% the record may likewise change at any time.
-record(raft_identity, {
    %% Service name (registered name) of the RAFT server this identity
    %% refers to.
    name :: atom(),
    %% Node on which the RAFT server this identity refers to is located.
    node :: node()
}).

%% This record represents a RAFT instance identifier.
-record(raft_identifier, {
    application :: atom(),
    table :: wa_raft:table(),
    partition :: wa_raft:partition()
}).

%%-------------------------------------------------------------------
%% Records for registered application and partition information
%%-------------------------------------------------------------------

%% Describes an application that has started a RAFT supervisor.
-record(raft_application, {
    %% Name of the application.
    name :: atom(),
    %% Applications making up the config search path.
    config_search_apps :: [atom()]
}).

%% Normalized options that `wa_raft_part_sup` produces and hands to the RAFT
%% processes. Must not be constructed externally.
-record(raft_options, {
    %% --- General ---
    application :: atom(),
    table :: wa_raft:table(),
    partition :: wa_raft:partition(),
    self :: #raft_identity{},
    identifier :: #raft_identifier{},
    database :: file:filename(),

    %% --- Acceptor ---
    acceptor_name :: atom(),

    %% --- Distribution ---
    distribution_module :: module(),

    %% --- Label ---
    label_module :: undefined | module(),

    %% --- Log ---
    log_name :: atom(),
    log_module :: module(),

    %% --- Log catchup ---
    log_catchup_name :: atom(),

    %% --- Queue ---
    queue_name :: atom(),
    queue_counters :: atomics:atomics_ref(),
    queue_reads :: atom(),

    %% --- Server ---
    server_name :: atom(),

    %% --- Storage ---
    storage_name :: atom(),
    storage_module :: module(),

    %% --- Partition supervisor ---
    supervisor_name :: atom(),

    %% --- Transport ---
    transport_cleanup_name :: atom(),
    transport_directory :: file:filename(),
    transport_module :: module()
}).
%%-------------------------------------------------------------------
%% Internal server states
%%-------------------------------------------------------------------

%% Raft runtime state
%%
%% Fields tagged [Leader] or [Disabled] are documented as relevant only while
%% the replica is in that state.
-record(raft_state, {
    %% Owning application
    application :: atom(),
    %% RAFT server name
    name :: atom(),
    %% RAFT server's cluster identity
    self :: #raft_identity{},
    %% RAFT replica's local identifier
    identifier :: #raft_identifier{},
    %% Table name
    table :: wa_raft:table(),
    %% Partition number
    partition :: wa_raft:partition(),
    %% Local path to partition data
    partition_path :: string(),

    %% Current view into this RAFT replica's log state
    log_view :: wa_raft_log:view(),
    %% Current queue handle
    queues :: wa_raft_queue:queues(),

    %% Active module for distribution of RPCs
    distribution_module :: module(),
    %% Active module for labeling of log entries
    label_module :: module() | undefined,

    %% Name of this RAFT replica's storage server
    storage :: atom(),
    %% Name of this RAFT replica's catchup server
    catchup :: atom(),

    %% The index of the latest log entry in the local log that is known to
    %% match the log entries committed by the cluster
    commit_index = 0 :: non_neg_integer(),
    %% The index of the latest log entry that has been sent to storage to be
    %% applied
    last_applied = 0 :: non_neg_integer(),

    %% The most recently written RAFT configuration and the index at which it
    %% was written if a configuration exists in storage
    cached_config :: undefined | {wa_raft_log:log_index(), wa_raft_server:config()},
    %% [Leader] The label of the last log entry in the current log
    last_label :: undefined | term(),
    %% The timestamp (milliseconds monotonic clock) of the most recently
    %% received (follower) or sent (leader) heartbeat.
    leader_heartbeat_ts :: undefined | integer(),

    %% The largest RAFT term that has been observed in the cluster or reached
    %% by this RAFT replica
    current_term = 0 :: non_neg_integer(),
    %% The peer that this RAFT replica voted for in the current term
    voted_for :: undefined | node(),
    %% The affirmative votes for leadership this RAFT replica has received from
    %% the cluster in the current term
    votes = #{} :: #{node() => true},
    %% The leader of the current RAFT term if known
    leader_id :: undefined | node(),

    %% The timestamp (milliseconds monotonic clock) that the current state of
    %% this RAFT replica was reached. Typed as `integer()` like the other
    %% monotonic timestamps in this record: Erlang monotonic time is measured
    %% from an unspecified point and may be negative.
    state_start_ts :: integer(),

    %% [Leader] The list of pending operations in the current commit batch
    %%          that are in queue to be appended and replicated after a short
    %%          wait to see if multiple commits can be handled at once to
    %%          reduce overhead
    pending_high = [] :: [{gen_server:from(), wa_raft_acceptor:op()}],
    %% NOTE(review): presumably the low-priority counterpart of
    %% `pending_high` (same element type) - confirm against the server code.
    pending_low = [] :: [{gen_server:from(), wa_raft_acceptor:op()}],

    %% [Leader] Whether or not a read has been accepted and is waiting for the
    %%          leader to establish a new quorum to be handled.
    pending_read = false :: boolean(),
    %% [Leader] The queue of accepted commit requests that are waiting to be
    %%          committed and applied for response to the client.
    queued = #{} :: #{wa_raft_log:log_index() => {gen_server:from(), wa_raft_acceptor:priority()}},
    %% [Leader] The index of the next log entry to send in the next heartbeat
    %%          to each peer
    next_indices = #{} :: #{node() => wa_raft_log:log_index()},
    %% [Leader] The index of the latest log entry in each peer's log that is
    %%          confirmed by a heartbeat response to match the local log
    match_indices = #{} :: #{node() => wa_raft_log:log_index()},
    %% [Leader] The index of the latest log entry that has been applied to
    %%          each peer's underlying storage state
    last_applied_indices = #{} :: #{node() => wa_raft_log:log_index()},

    %% [Leader] The timestamp (milliseconds monotonic clock) of the last time
    %%          each peer was sent a heartbeat
    last_heartbeat_ts = #{} :: #{node() => integer()},
    %% [Leader] The timestamp (milliseconds monotonic clock) of the last time
    %%          each peer responded to this RAFT replica with a heartbeat
    %%          response
    heartbeat_response_ts = #{} :: #{node() => integer()},
    %% [Leader] The log index of the first log entry appended to the log that
    %%          has a log term matching the current term
    first_current_term_log_index = 0 :: wa_raft_log:log_index(),
    %% [Leader] Information about a currently pending handover of leadership to
    %%          a peer
    handover :: undefined | {node(), reference(), integer()},

    %% [Disabled] The reason for which this RAFT replica was disabled
    disable_reason :: term()
}).