├── .gitignore ├── raft-kv ├── raft │ ├── raft_status.cpp │ ├── raft_status.h │ ├── util.h │ ├── config.cpp │ ├── unstable.h │ ├── readonly.h │ ├── util.cpp │ ├── readonly.cpp │ ├── ready.cpp │ ├── ready.h │ ├── unstable.cpp │ ├── proto.cpp │ ├── storage.h │ ├── progress.cpp │ ├── progress.h │ ├── raft_log.h │ ├── proto.h │ ├── config.h │ ├── node.h │ ├── storage.cpp │ ├── raft.h │ ├── node.cpp │ └── raft_log.cpp ├── common │ ├── random_device.cpp │ ├── random_device.h │ ├── bytebuffer.h │ ├── log.h │ ├── bytebuffer.cpp │ ├── status.h │ ├── status.cpp │ └── slice.h ├── transport │ ├── proto.h │ ├── raft_server.h │ ├── transport.h │ ├── peer.h │ ├── transport.cpp │ ├── peer.cpp │ └── raft_server.cpp ├── snap │ ├── snapshotter.h │ └── snapshotter.cpp ├── CMakeLists.txt ├── raft-kv.cpp ├── server │ ├── redis_session.h │ ├── raft_node.h │ ├── redis_store.h │ ├── redis_store.cpp │ └── redis_session.cpp └── wal │ ├── wal.h │ └── wal.cpp ├── Procfile.in ├── tests ├── test_proto.cpp ├── test_bytebuffer.cpp ├── string_match.cpp ├── CMakeLists.txt ├── test_snapshotter.cpp ├── test_wal.cpp ├── test_msgpack.cpp ├── raft_snap_test.cpp ├── test_progress.cpp └── network.hpp ├── CMakeLists.txt ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | build 3 | cmake-build-debug -------------------------------------------------------------------------------- /raft-kv/raft/raft_status.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by ljy on 2/10/19. 
// RandomDevice draws uniformly distributed integers from the closed range
// [min, max] that is fixed at construction time.
class RandomDevice {
 public:
  // Seeds the Mersenne Twister engine once from std::random_device and
  // configures the distribution for [min, max] (both inclusive).
  // Requires min <= max.
  explicit RandomDevice(uint32_t min, std::uint32_t max)
      : gen_(rd_()),
        distribution_(min, max) {
  }

  // Returns the next value drawn from the configured range.
  uint32_t gen();

 private:
  // Declared before gen_ on purpose: members are initialized in declaration
  // order and gen_ is seeded from rd_ in the initializer list.
  std::random_device rd_;
  std::mt19937 gen_;
  // Typed on uint32_t to match the constructor arguments. The default
  // (int) distribution would narrow bounds above INT_MAX to negative values
  // and break the distribution's min <= max precondition.
  std::uniform_int_distribution<uint32_t> distribution_;
};
// Header that precedes a transport payload.
// The struct is packed to 1-byte alignment, so there is no padding between
// `type` and `len` and the fixed part occupies exactly 5 bytes.
#pragma pack(1)
struct TransportMeta {
  // NOTE(review): presumably one of the TransportType* constants declared
  // above — confirm against the senders.
  uint8_t type;
  // NOTE(review): presumably the payload length in bytes; the byte order used
  // on the wire is not visible here — confirm.
  uint32_t len;
  // Zero-length trailing array (a compiler extension, not standard C++) that
  // names the first byte after the fixed header.
  uint8_t data[0];
};
#pragma pack()
9 | proto::MessageType vote_resp_msg_type(proto::MessageType type); 10 | 11 | bool is_local_msg(proto::MessageType type); 12 | 13 | uint32_t compute_crc32(const char* data, size_t len); 14 | 15 | bool is_must_sync(const proto::HardState& st, const proto::HardState& prevst, size_t entsnum); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /raft-kv/snap/snapshotter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | class Snapshotter { 10 | public: 11 | explicit Snapshotter(const std::string& dir) 12 | : dir_(dir) { 13 | } 14 | 15 | ~Snapshotter() = default; 16 | 17 | Status load(proto::Snapshot& snapshot); 18 | 19 | Status save_snap(const proto::Snapshot& snapshot); 20 | 21 | static std::string snap_name(uint64_t term, uint64_t index); 22 | 23 | private: 24 | void get_snap_names(std::vector& names); 25 | 26 | Status load_snap(const std::string& filename, proto::Snapshot& snapshot); 27 | 28 | private: 29 | std::string dir_; 30 | }; 31 | 32 | } -------------------------------------------------------------------------------- /raft-kv/common/bytebuffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | class ByteBuffer { 10 | public: 11 | explicit ByteBuffer(); 12 | 13 | void put(const uint8_t* data, uint32_t len); 14 | 15 | void read_bytes(uint32_t bytes); 16 | 17 | bool readable() const { 18 | return writer_ > reader_; 19 | } 20 | 21 | uint32_t readable_bytes() const; 22 | 23 | uint32_t capacity() const { 24 | return static_cast(buff_.capacity()); 25 | } 26 | 27 | const uint8_t* reader() const { 28 | return buff_.data() + reader_; 29 | } 30 | 31 | Slice slice() const { 32 | return Slice((const char*) reader(), readable_bytes()); 33 | } 34 | 35 | void reset(); 
36 | private: 37 | void may_shrink_to_fit(); 38 | 39 | uint32_t reader_; 40 | uint32_t writer_; 41 | std::vector buff_; 42 | }; 43 | 44 | } 45 | -------------------------------------------------------------------------------- /raft-kv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | set(SRC 3 | common/log.h 4 | common/slice.h 5 | common/status.cpp 6 | common/bytebuffer.cpp 7 | common/random_device.cpp 8 | raft/proto.cpp 9 | raft/config.cpp 10 | raft/raft.cpp 11 | raft/storage.cpp 12 | raft/node.cpp 13 | raft/raft_log.cpp 14 | raft/unstable.cpp 15 | raft/progress.cpp 16 | raft/readonly.cpp 17 | raft/raft_status.cpp 18 | raft/ready.cpp 19 | raft/util.cpp 20 | server/raft_node.cpp 21 | server/redis_session.cpp 22 | server/redis_store.cpp 23 | snap/snapshotter.cpp 24 | transport/proto.h 25 | transport/transport.h 26 | transport/transport.cpp 27 | transport/peer.h 28 | transport/raft_server.cpp 29 | transport/peer.cpp 30 | wal/wal.cpp) 31 | add_library(raft-kv++ ${SRC}) 32 | target_link_libraries(raft-kv++ ${LIBS}) 33 | 34 | 35 | add_executable(raft-kv raft-kv.cpp) 36 | target_link_libraries(raft-kv ${LIBS} raft-kv++) -------------------------------------------------------------------------------- /tests/test_bytebuffer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace kv; 5 | 6 | TEST(test_buffer, test_buffer) { 7 | ByteBuffer buff; 8 | 9 | buff.put((const uint8_t*) "abc", 3); 10 | ASSERT_TRUE(buff.readable()); 11 | ASSERT_TRUE(buff.readable_bytes() == 3); 12 | 13 | buff.read_bytes(1); 14 | char buffer[4096] = {0}; 15 | memcpy(buffer, buff.reader(), buff.readable_bytes()); 16 | ASSERT_TRUE(buff.readable_bytes() == 2); 17 | ASSERT_TRUE(buffer == std::string("bc")); 18 | ASSERT_TRUE(buff.slice().to_string() == "bc"); 19 | 20 | buff.read_bytes(2); 21 | ASSERT_TRUE(buff.readable_bytes() == 0); 22 | 23 | 
// LOG_FILE_BASENAME is the file-name part of __FILE__.
// strrchr() returns NULL when the path contains no '/', so fall back to the
// whole __FILE__ in that case instead of computing NULL + 1 and handing it
// to a "%s" conversion.
#define LOG_FILE_BASENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)

// LOG_DEBUG writes to stderr only when DEBUG is defined; otherwise it expands
// to nothing. The brace form (rather than do/while) is kept so existing call
// sites keep compiling unchanged.
#ifdef DEBUG
#define LOG_DEBUG(format, ...) { fprintf(stderr, "DEBUG [%s:%d] " format "\n", LOG_FILE_BASENAME, __LINE__, ##__VA_ARGS__); }
#else
#define LOG_DEBUG(format, ...)
#endif

// LOG_INFO / LOG_WARN / LOG_ERROR write one printf-style formatted line to
// stderr, prefixed with the level and the "file:line" of the call site.
#define LOG_INFO(format, ...) do { fprintf(stderr, "INFO [%s:%d] " format "\n", LOG_FILE_BASENAME, __LINE__, ##__VA_ARGS__); } while(0)
#define LOG_WARN(format, ...) do { fprintf(stderr, "WARN [%s:%d] " format "\n", LOG_FILE_BASENAME, __LINE__, ##__VA_ARGS__); } while(0)
#define LOG_ERROR(format, ...) do { fprintf(stderr, "ERROR [%s:%d] " format "\n", LOG_FILE_BASENAME, __LINE__, ##__VA_ARGS__); } while(0)

// LOG_FATAL does not print anything: it formats the message (truncated to
// the 1024-byte buffer) and throws it as std::runtime_error.
#define LOG_FATAL(format, ...) do \
{ \
    char buffer[1024]; \
    snprintf(buffer, sizeof(buffer), "FATAL [%s:%d] " format "\n", LOG_FILE_BASENAME, __LINE__, ##__VA_ARGS__); \
    throw std::runtime_error(buffer); \
} while(0)
report_unreachable(uint64_t id) = 0; 18 | 19 | virtual void report_snapshot(uint64_t id, SnapshotStatus status) = 0; 20 | 21 | virtual uint64_t node_id() const = 0; 22 | }; 23 | typedef std::shared_ptr RaftServerPtr; 24 | 25 | class IoServer { 26 | public: 27 | virtual void start() = 0; 28 | virtual void stop() = 0; 29 | 30 | static std::shared_ptr create(void* io_service, 31 | const std::string& host, 32 | RaftServer* raft); 33 | }; 34 | typedef std::shared_ptr IoServerPtr; 35 | 36 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 lijunyun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /raft-kv/common/bytebuffer.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | static uint32_t MIN_BUFFERING = 4096; 7 | 8 | ByteBuffer::ByteBuffer() 9 | : reader_(0), 10 | writer_(0), 11 | buff_(MIN_BUFFERING) { 12 | 13 | } 14 | 15 | void ByteBuffer::put(const uint8_t* data, uint32_t len) { 16 | uint32_t left = static_cast(buff_.size()) - writer_; 17 | if (left < len) { 18 | buff_.resize(buff_.size() * 2 + len, 0); 19 | } 20 | memcpy(buff_.data() + writer_, data, len); 21 | writer_ += len; 22 | } 23 | 24 | uint32_t ByteBuffer::readable_bytes() const { 25 | assert(writer_ >= reader_); 26 | return writer_ - reader_; 27 | } 28 | 29 | void ByteBuffer::read_bytes(uint32_t bytes) { 30 | assert(readable_bytes() >= bytes); 31 | reader_ += bytes; 32 | may_shrink_to_fit(); 33 | } 34 | 35 | void ByteBuffer::may_shrink_to_fit() { 36 | if (reader_ == writer_) { 37 | reader_ = 0; 38 | writer_ = 0; 39 | } 40 | } 41 | 42 | void ByteBuffer::reset() { 43 | reader_ = writer_ = 0; 44 | buff_.resize(MIN_BUFFERING); 45 | buff_.shrink_to_fit(); 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /raft-kv/transport/transport.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace kv { 14 | 15 | class Transport { 16 | public: 17 | virtual ~Transport() = default; 18 | 19 | virtual void start(const std::string& host) = 0; 20 | 21 | virtual void stop() = 0; 22 | 23 | // sends out the given messages to the remote peers. 24 | // Each message has a To field, which is an id that maps 25 | // to an existing peer in the transport. 
26 | // If the id cannot be found in the transport, the message 27 | // will be ignored. 28 | virtual void send(std::vector msgs) = 0; 29 | 30 | virtual void add_peer(uint64_t id, const std::string& peer) = 0; 31 | 32 | virtual void remove_peer(uint64_t id) = 0; 33 | 34 | static std::shared_ptr create(RaftServer* raft, uint64_t id); 35 | }; 36 | typedef std::shared_ptr TransporterPtr; 37 | 38 | } 39 | -------------------------------------------------------------------------------- /tests/string_match.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace kv; 5 | 6 | TEST(match, match) { 7 | struct Test { 8 | std::string pattern; 9 | std::string key; 10 | int match; 11 | }; 12 | 13 | std::vector tests; 14 | tests.push_back(Test{.pattern = "*", .key = "abc", .match = 1}); 15 | tests.push_back(Test{.pattern = "a*c", .key = "abc", .match = 1}); 16 | tests.push_back(Test{.pattern = "a*", .key = "abc", .match = 1}); 17 | tests.push_back(Test{.pattern = "b*", .key = "abc", .match = 0}); 18 | tests.push_back(Test{.pattern = "*c", .key = "abc", .match = 1}); 19 | tests.push_back(Test{.pattern = "*a", .key = "abc", .match = 0}); 20 | tests.push_back(Test{.pattern = "*b*", .key = "abc", .match = 1}); 21 | tests.push_back(Test{.pattern = "", .key = "abc", .match = 0}); 22 | 23 | for (Test& t : tests) { 24 | int match = string_match_len(t.pattern.data(), t.pattern.size(), t.key.data(), t.key.size(), 0); 25 | ASSERT_TRUE(match == t.match); 26 | 27 | } 28 | } 29 | 30 | int main(int argc, char* argv[]) { 31 | testing::InitGoogleTest(&argc, argv); 32 | return RUN_ALL_TESTS(); 33 | } 34 | -------------------------------------------------------------------------------- /raft-kv/transport/peer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | class Peer { 8 | public: 9 | virtual ~Peer() = 
default; 10 | 11 | virtual void start() = 0; 12 | 13 | // send sends the message to the remote peer. The function is non-blocking 14 | // and has no promise that the message will be received by the remote. 15 | // When it fails to send message out, it will report the status to underlying 16 | // raft. 17 | virtual void send(proto::MessagePtr msg) = 0; 18 | 19 | // sendSnap sends the merged snapshot message to the remote peer. Its behavior 20 | // is similar to send. 21 | virtual void send_snap(proto::SnapshotPtr snap) = 0; 22 | 23 | // update updates the urls of remote peer. 24 | virtual void update(const std::string& peer) = 0; 25 | 26 | // activeSince returns the time that the connection with the 27 | // peer becomes active. 28 | virtual uint64_t active_since() = 0; 29 | 30 | // stop performs any necessary finalization and terminates the peer 31 | // elegantly 32 | virtual void stop() = 0; 33 | 34 | static std::shared_ptr creat(uint64_t peer, const std::string& peer_str, void* io_service); 35 | }; 36 | typedef std::shared_ptr PeerPtr; 37 | 38 | } -------------------------------------------------------------------------------- /raft-kv/raft-kv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | static uint64_t g_id = 0; 8 | static const char* g_cluster = NULL; 9 | static uint16_t g_port = 0; 10 | 11 | int main(int argc, char* argv[]) { 12 | GOptionEntry entries[] = { 13 | {"id", 'i', 0, G_OPTION_ARG_INT64, &g_id, "node id", NULL}, 14 | {"cluster", 'c', 0, G_OPTION_ARG_STRING, &g_cluster, "comma separated cluster peers", NULL}, 15 | {"port", 'p', 0, G_OPTION_ARG_INT, &g_port, "key-value server port", NULL}, 16 | {NULL} 17 | }; 18 | 19 | GError* error = NULL; 20 | GOptionContext* context = g_option_context_new("usage"); 21 | g_option_context_add_main_entries(context, entries, NULL); 22 | if (!g_option_context_parse(context, &argc, &argv, &error)) { 23 | 
fprintf(stderr, "option parsing failed: %s\n", error->message); 24 | exit(EXIT_FAILURE); 25 | } 26 | fprintf(stderr, "id:%lu, port:%d, cluster:%s\n", g_id, g_port, g_cluster); 27 | 28 | if (g_id == 0 || g_port == 0) { 29 | char* help = g_option_context_get_help(context, true, NULL); 30 | fprintf(stderr, help); 31 | free(help); 32 | exit(EXIT_FAILURE); 33 | } 34 | 35 | kv::RaftNode::main(g_id, g_cluster, g_port); 36 | g_option_context_free(context); 37 | } 38 | -------------------------------------------------------------------------------- /raft-kv/raft/config.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace kv { 4 | 5 | Status Config::validate() { 6 | if (this->id == 0) { 7 | return Status::invalid_argument("cannot use none as id"); 8 | } 9 | 10 | if (this->heartbeat_tick <= 0) { 11 | return Status::invalid_argument("heartbeat tick must be greater than 0"); 12 | } 13 | 14 | if (this->election_tick <= this->heartbeat_tick) { 15 | return Status::invalid_argument("election tick must be greater than heartbeat tick"); 16 | } 17 | 18 | if (!this->storage) { 19 | return Status::invalid_argument("storage cannot be nil"); 20 | } 21 | 22 | if (this->max_uncommitted_entries_size == 0) { 23 | this->max_uncommitted_entries_size = std::numeric_limits::max(); 24 | } 25 | 26 | // default max_committed_size_per_ready to max_size_per_msg because they were 27 | // previously the same parameter. 
28 | if (this->max_committed_size_per_ready == 0) { 29 | max_committed_size_per_ready = this->max_size_per_msg; 30 | } 31 | 32 | if (this->max_inflight_msgs <= 0) { 33 | return Status::invalid_argument("max inflight messages must be greater than 0"); 34 | } 35 | 36 | if (this->read_only_option == ReadOnlyLeaseBased && !this->check_quorum) { 37 | return Status::invalid_argument("check_quorum must be enabled when read_only_option is ReadOnlyLeaseBased"); 38 | } 39 | 40 | return Status::ok(); 41 | 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /raft-kv/server/redis_session.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | class RedisStore; 10 | class RedisSession : public std::enable_shared_from_this { 11 | public: 12 | explicit RedisSession(RedisStore* server, boost::asio::io_service& io_service); 13 | 14 | ~RedisSession() { 15 | redisReaderFree(reader_); 16 | } 17 | 18 | void start(); 19 | 20 | void handle_read(size_t bytes); 21 | 22 | void on_redis_reply(struct redisReply* reply); 23 | 24 | void send_reply(const char* data, uint32_t len); 25 | 26 | void start_send(); 27 | 28 | static void ping_command(std::shared_ptr self, struct redisReply* reply); 29 | 30 | static void get_command(std::shared_ptr self, struct redisReply* reply); 31 | 32 | static void set_command(std::shared_ptr self, struct redisReply* reply); 33 | 34 | static void del_command(std::shared_ptr self, struct redisReply* reply); 35 | 36 | static void keys_command(std::shared_ptr self, struct redisReply* reply); 37 | public: 38 | bool quit_; 39 | RedisStore* server_; 40 | boost::asio::ip::tcp::socket socket_; 41 | std::vector read_buffer_; 42 | redisReader* reader_; 43 | ByteBuffer send_buffer_; 44 | }; 45 | typedef std::shared_ptr RedisSessionPtr; 46 | 47 | } 
-------------------------------------------------------------------------------- /raft-kv/raft/unstable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace kv { 5 | 6 | // Unstable.entries[i] has raft log position i+unstable.offset. 7 | // Note that unstable.offset may be less than the highest log 8 | // position in storage; this means that the next write to storage 9 | // might need to truncate the log before persisting unstable.entries. 10 | class Unstable { 11 | public: 12 | explicit Unstable(uint64_t offset) 13 | : offset_(offset) { 14 | 15 | } 16 | 17 | // maybe_first_index returns the index of the first possible entry in entries 18 | // if it has a snapshot. 19 | void maybe_first_index(uint64_t& index, bool& ok); 20 | 21 | // maybe_last_index returns the last index if it has at least one 22 | // unstable entry or snapshot. 23 | void maybe_last_index(uint64_t& index, bool& ok); 24 | 25 | // maybe_term returns the term of the entry at index i, if there 26 | // is any. 27 | void maybe_term(uint64_t index, uint64_t& term, bool& ok); 28 | 29 | void stable_to(uint64_t index, uint64_t term); 30 | 31 | void stable_snap_to(uint64_t index); 32 | 33 | void restore(proto::SnapshotPtr snapshot); 34 | 35 | void truncate_and_append(std::vector entries); 36 | 37 | void slice(uint64_t low, uint64_t high, std::vector& entries); 38 | public: 39 | // the incoming unstable snapshot, if any. 40 | proto::SnapshotPtr snapshot_; 41 | // all entries that have not yet been written to storage. 
42 | 43 | std::vector entries_; 44 | uint64_t offset_; 45 | }; 46 | typedef std::shared_ptr UnstablePtr; 47 | 48 | } 49 | -------------------------------------------------------------------------------- /raft-kv/raft/readonly.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace kv { 7 | 8 | // ReadState provides state for read only query. 9 | // It's caller's responsibility to call read_index first before getting 10 | // this state from ready, it's also caller's duty to differentiate if this 11 | // state is what it requests through RequestCtx, eg. given a unique id as 12 | // request_ctx 13 | struct ReadState { 14 | bool equal(const ReadState& rs) const { 15 | if (index != rs.index) { 16 | return false; 17 | } 18 | return request_ctx == rs.request_ctx; 19 | } 20 | uint64_t index; 21 | std::vector request_ctx; 22 | }; 23 | 24 | struct ReadIndexStatus { 25 | proto::Message req; 26 | uint64_t index; 27 | std::unordered_set acks; 28 | 29 | }; 30 | typedef std::shared_ptr ReadIndexStatusPtr; 31 | 32 | struct ReadOnly { 33 | explicit ReadOnly(ReadOnlyOption option) 34 | : option(option) {} 35 | 36 | // last_pending_request_ctx returns the context of the last pending read only 37 | // request in readonly struct. 
// Status is the result of an operation: either success or an error code
// together with a message. A success status stores nothing (null pointer).
class Status {
 public:
  // Create a success status.
  Status()
      : status_(nullptr) {}

  // Copies duplicate the underlying buffer; no storage is shared.
  Status(const Status& s);

  Status& operator=(const Status& s);

  ~Status();

  // Factories: one for success and one per error code.
  static Status ok() { return Status(); }

  static Status not_found(const char* msg) { return Status(NotFound, msg); }

  static Status not_supported(const char* msg) { return Status(NotSupported, msg); }

  static Status invalid_argument(const char* msg) { return Status(InvalidArgument, msg); }

  static Status io_error(const char* msg) { return Status(IOError, msg); }

  // Predicates on the stored code.
  bool is_ok() const { return status_ == nullptr; }

  bool is_not_found() const { return code() == Code::NotFound; }

  bool is_io_error() const { return code() == Code::IOError; }

  bool is_not_supported() const { return code() == NotSupported; }

  bool is_invalid_argument() const { return code() == InvalidArgument; }

  // Human-readable form; returns "ok" for a success status.
  std::string to_string() const;

 private:

  enum Code {
    OK = 0,
    NotFound = 1,
    NotSupported = 2,
    InvalidArgument = 3,
    IOError = 4,
  };

  // Returns a freshly allocated duplicate of s.status_, or nullptr when s is ok.
  inline static char* copy(const Status& s);

  Status(Code code, const char* msg);

  // Decodes the code byte of the buffer; a null buffer means OK.
  Code code() const {
    return status_ == nullptr ? Code::OK : static_cast<Code>(status_[4]);
  }

 private:
  // nullptr for success, otherwise a heap buffer laid out as:
  //   status_[0..3] == length of message
  //   status_[4]    == code
  //   status_[5..]  == message
  char* status_;
};
49 | bool is_must_sync(const proto::HardState& st, const proto::HardState& prevst, size_t entsnum) { 50 | // Persistent state on all servers: 51 | // (Updated on stable storage before responding to RPCs) 52 | // currentTerm 53 | // votedFor 54 | // log entries[] 55 | return entsnum != 0 || st.vote != prevst.vote || st.term != prevst.term; 56 | } 57 | 58 | 59 | } 60 | 61 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | set(LIBS 3 | ${LIBS} 4 | raft-kv++) 5 | 6 | enable_testing() 7 | find_package(GTest REQUIRED) 8 | 9 | add_executable(test_msgpack test_msgpack.cpp) 10 | target_link_libraries(test_msgpack ${LIBS}) 11 | gtest_add_tests(TARGET test_msgpack) 12 | 13 | add_executable(test_bytebuffer test_bytebuffer.cpp) 14 | target_link_libraries(test_bytebuffer ${LIBS}) 15 | gtest_add_tests(TARGET test_bytebuffer) 16 | 17 | add_executable(test_proto test_proto.cpp) 18 | target_link_libraries(test_proto ${LIBS}) 19 | gtest_add_tests(TARGET test_proto) 20 | 21 | add_executable(test_storage test_storage.cpp) 22 | target_link_libraries(test_storage ${LIBS}) 23 | gtest_add_tests(TARGET test_storage) 24 | 25 | add_executable(test_unstable test_unstable.cpp) 26 | target_link_libraries(test_unstable ${LIBS}) 27 | gtest_add_tests(TARGET test_unstable) 28 | 29 | add_executable(test_raftlog test_raftlog.cpp) 30 | target_link_libraries(test_raftlog ${LIBS}) 31 | gtest_add_tests(TARGET test_raftlog) 32 | 33 | add_executable(test_progress test_progress.cpp) 34 | target_link_libraries(test_progress ${LIBS}) 35 | gtest_add_tests(TARGET test_progress) 36 | 37 | add_executable(test_raft test_raft.cpp network.hpp) 38 | target_link_libraries(test_raft ${LIBS}) 39 | gtest_add_tests(TARGET test_raft) 40 | 41 | add_executable(test_rawnode test_rawnode.cpp) 42 | target_link_libraries(test_rawnode ${LIBS}) 43 | gtest_add_tests(TARGET test_rawnode) 44 | 
45 | add_executable(string_match string_match.cpp) 46 | target_link_libraries(string_match ${LIBS}) 47 | gtest_add_tests(TARGET string_match) 48 | 49 | add_executable(raft_snap_test raft_snap_test.cpp network.hpp) 50 | target_link_libraries(raft_snap_test ${LIBS}) 51 | gtest_add_tests(TARGET raft_snap_test) 52 | 53 | add_executable(test_snapshotter test_snapshotter.cpp) 54 | target_link_libraries(test_snapshotter ${LIBS}) 55 | gtest_add_tests(TARGET test_snapshotter) 56 | 57 | add_executable(test_wal test_wal.cpp) 58 | target_link_libraries(test_wal ${LIBS}) 59 | gtest_add_tests(TARGET test_wal) -------------------------------------------------------------------------------- /raft-kv/common/status.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | Status::~Status() { 10 | if (status_) { 11 | free(status_); 12 | } 13 | } 14 | 15 | Status::Status(const Status& s) { 16 | status_ = copy(s); 17 | } 18 | 19 | Status& Status::operator=(const Status& s) { 20 | if (status_ != nullptr) { 21 | free(status_); 22 | } 23 | status_ = copy(s); 24 | return *this; 25 | } 26 | 27 | char* Status::copy(const Status& s) { 28 | if (s.status_ == nullptr) { 29 | return nullptr; 30 | } else { 31 | uint32_t len; 32 | memcpy(&len, s.status_, sizeof(uint32_t)); 33 | 34 | char* status = (char*) malloc(len + 5); 35 | memcpy(status, s.status_, len + 5); 36 | return status; 37 | } 38 | } 39 | 40 | std::string Status::to_string() const { 41 | if (is_ok()) { 42 | return "ok"; 43 | } 44 | 45 | const char* str; 46 | char tmp[30]; 47 | Code c = code(); 48 | switch (c) { 49 | case Code::OK :str = "ok"; 50 | break; 51 | case Code::NotFound :str = "not found:"; 52 | break; 53 | case Code::NotSupported :str = "not supported:"; 54 | break; 55 | case Code::InvalidArgument :str = "invalid argument:"; 56 | break; 57 | case Code::IOError :str = "io error:"; 58 | break; 59 | default: 
{ 60 | snprintf(tmp, sizeof(tmp), "Unknown code(%d):", c); 61 | str = tmp; 62 | } 63 | } 64 | 65 | std::string ret(str); 66 | uint32_t length; 67 | memcpy(&length, status_, sizeof(length)); 68 | if (length > 0) { 69 | ret.append(status_ + 5, length); 70 | 71 | } else { 72 | ret.pop_back(); 73 | } 74 | 75 | return ret; 76 | 77 | } 78 | 79 | Status::Status(Code code, const char* msg) { 80 | uint32_t len; 81 | if (msg == nullptr) { 82 | len = 0; 83 | } else { 84 | len = strlen(msg); 85 | } 86 | status_ = (char*) malloc(len + 5); 87 | memcpy(status_, &len, sizeof(uint32_t)); 88 | status_[4] = code; 89 | memcpy(status_ + 5, msg, len); 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # raft-kv 2 | 3 | A C++ implementation of RAFT consensus algorithm based on [asio](https://github.com/boostorg/asio). It provides a key-value store compatible with the [redis protocol](https://redis.io/topics/protocol). 4 | 5 | ## Getting Started 6 | 7 | ### Build 8 | 9 | git clone https://github.com/jinyyu/raft-kv.git 10 | mkdir -p raft-kv/build 11 | cd raft-kv/build 12 | cmake .. -DCMAKE_BUILD_TYPE=Release 13 | make -j8 14 | 15 | ### Running a cluster 16 | 17 | First install [goreman](https://github.com/mattn/goreman), which manages Procfile-based applications. 18 | 19 | goreman start 20 | 21 | 22 | ### Test 23 | 24 | install [redis-cli](https://github.com/antirez/redis), a redis console client. 
// Appends the context of the newest queued read-index request to `ctx`.
//
// The newest request is the last element of read_index_queue; its bytes are
// copied onto the end of `ctx`, so anything already stored in `ctx` is kept.
// When the queue is empty `ctx` is left untouched.
//
// NOTE(review): the element type of `ctx` is not visible in this excerpt.
void ReadOnly::last_pending_request_ctx(std::vector& ctx) {
  if (read_index_queue.empty()) {
    return;
  }

  // Append rather than assign: existing contents of ctx are preserved.
  ctx.insert(ctx.end(), read_index_queue.back().begin(), read_index_queue.back().end());
}
from local node 24 | return it->second->acks.size() + 1; 25 | } 26 | 27 | std::vector ReadOnly::advance(const proto::Message& msg) { 28 | std::vector rss; 29 | 30 | std::string ctx(msg.context.begin(), msg.context.end()); 31 | bool found = false; 32 | uint32_t i = 0; 33 | for (std::string& okctx: read_index_queue) { 34 | i++; 35 | auto it = pending_read_index.find(okctx); 36 | if (it == pending_read_index.end()) { 37 | LOG_FATAL("cannot find corresponding read state from pending map"); 38 | } 39 | 40 | rss.push_back(it->second); 41 | if (okctx == ctx) { 42 | found = true; 43 | break; 44 | } 45 | 46 | } 47 | 48 | if (found) { 49 | read_index_queue.erase(read_index_queue.begin(), read_index_queue.begin() + i); 50 | for (ReadIndexStatusPtr& rs : rss) { 51 | std::string str(rs->req.entries[0].data.begin(), rs->req.entries[0].data.end()); 52 | pending_read_index.erase(str); 53 | } 54 | } 55 | return rss; 56 | } 57 | 58 | void ReadOnly::add_request(uint64_t index, proto::MessagePtr msg) { 59 | std::string ctx(msg->entries[0].data.begin(), msg->entries[0].data.end()); 60 | auto it = pending_read_index.find(ctx); 61 | if (it != pending_read_index.end()) { 62 | return; 63 | } 64 | ReadIndexStatusPtr status(new ReadIndexStatus()); 65 | status->index = index; 66 | status->req = *msg; 67 | pending_read_index[ctx] = status; 68 | read_index_queue.push_back(ctx); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /raft-kv/server/raft_node.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace kv { 13 | 14 | class RaftNode : public RaftServer { 15 | public: 16 | static void main(uint64_t id, const std::string& cluster, uint16_t port); 17 | 18 | explicit RaftNode(uint64_t id, const std::string& cluster, uint16_t port); 19 | 20 | ~RaftNode() final; 
21 | 22 | void stop(); 23 | 24 | void propose(std::shared_ptr> data, const StatusCallback& callback); 25 | 26 | void process(proto::MessagePtr msg, const StatusCallback& callback) final; 27 | 28 | void is_id_removed(uint64_t id, const std::function& callback) final; 29 | 30 | void report_unreachable(uint64_t id) final; 31 | 32 | void report_snapshot(uint64_t id, SnapshotStatus status) final; 33 | 34 | uint64_t node_id() const final { return id_; } 35 | 36 | bool publish_entries(const std::vector& entries); 37 | void entries_to_apply(const std::vector& entries, std::vector& ents); 38 | void maybe_trigger_snapshot(); 39 | 40 | private: 41 | void start_timer(); 42 | void pull_ready_events(); 43 | Status save_snap(const proto::Snapshot& snap); 44 | void publish_snapshot(const proto::Snapshot& snap); 45 | 46 | // replay_WAL replays WAL entries into the raft instance. 47 | void replay_WAL(); 48 | // open_WAL opens a WAL ready for reading. 49 | void open_WAL(const proto::Snapshot& snap); 50 | 51 | void schedule(); 52 | 53 | uint16_t port_; 54 | pthread_t pthread_id_; 55 | boost::asio::io_service io_service_; 56 | boost::asio::deadline_timer timer_; 57 | uint64_t id_; 58 | std::vector peers_; 59 | uint64_t last_index_; 60 | proto::ConfStatePtr conf_state_; 61 | uint64_t snapshot_index_; 62 | uint64_t applied_index_; 63 | 64 | MemoryStoragePtr storage_; 65 | std::unique_ptr node_; 66 | TransporterPtr transport_; 67 | std::shared_ptr redis_server_; 68 | 69 | std::vector snap_data_; 70 | std::string snap_dir_; 71 | uint64_t snap_count_; 72 | std::unique_ptr snapshotter_; 73 | 74 | std::string wal_dir_; 75 | WAL_ptr wal_; 76 | }; 77 | typedef std::shared_ptr RaftNodePtr; 78 | 79 | } -------------------------------------------------------------------------------- /raft-kv/server/redis_store.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | 10 | namespace kv { 11 | 12 | int string_match_len(const char* pattern, int patternLen, 13 | const char* string, int stringLen, int nocase); 14 | 15 | struct RedisCommitData { 16 | static const uint8_t kCommitSet = 0; 17 | static const uint8_t kCommitDel = 1; 18 | 19 | uint8_t type; 20 | std::vector strs; 21 | MSGPACK_DEFINE (type, strs); 22 | }; 23 | 24 | struct RaftCommit { 25 | 26 | RaftCommit() {} 27 | 28 | uint32_t node_id; 29 | uint32_t commit_id; 30 | RedisCommitData redis_data; 31 | MSGPACK_DEFINE (node_id, commit_id, redis_data); 32 | }; 33 | 34 | typedef std::function StatusCallback; 35 | typedef std::shared_ptr> SnapshotDataPtr; 36 | typedef std::function GetSnapshotCallback; 37 | 38 | class RaftNode; 39 | class RedisStore { 40 | public: 41 | explicit RedisStore(RaftNode* server, std::vector snap, uint16_t port); 42 | 43 | ~RedisStore(); 44 | 45 | void stop() { 46 | io_service_.stop(); 47 | if (worker_.joinable()) { 48 | worker_.join(); 49 | } 50 | } 51 | 52 | void start(std::promise& promise); 53 | 54 | bool get(const std::string& key, std::string& value) { 55 | auto it = key_values_.find(key); 56 | if (it != key_values_.end()) { 57 | value = it->second; 58 | return true; 59 | } else { 60 | return false; 61 | } 62 | } 63 | 64 | void set(std::string key, std::string value, const StatusCallback& callback); 65 | 66 | void del(std::vector keys, const StatusCallback& callback); 67 | 68 | void get_snapshot(const GetSnapshotCallback& callback); 69 | 70 | void recover_from_snapshot(SnapshotDataPtr snap, const StatusCallback& callback); 71 | 72 | void keys(const char* pattern, int len, std::vector& keys); 73 | 74 | void read_commit(proto::EntryPtr entry); 75 | 76 | private: 77 | void start_accept(); 78 | 79 | RaftNode* server_; 80 | boost::asio::io_service io_service_; 81 | boost::asio::ip::tcp::acceptor acceptor_; 82 | std::thread worker_; 83 | std::unordered_map key_values_; 84 | uint32_t next_request_id_; 85 | std::unordered_map 
// Builds a scratch directory name of the form
// "_test_snapshot/<unix time>_<pid>" for the snapshot tests.
static std::string get_tmp_snapshot_dir() {
  const int now = (int) time(NULL);
  const int pid = (int) getpid();

  char path[128];
  snprintf(path, sizeof(path), "_test_snapshot/%d_%d", now, pid);
  return std::string(path);
}
snapshot; 65 | status = snap.load(snapshot); 66 | 67 | ASSERT_TRUE(status.is_ok()); 68 | ASSERT_TRUE(s.equal(snapshot)); 69 | std::string broken = std::string(tmp) + ".broken"; 70 | ASSERT_TRUE(boost::filesystem::exists(broken)); 71 | } 72 | 73 | int main(int argc, char* argv[]) { 74 | boost::system::error_code code; 75 | boost::filesystem::create_directories("_test_snapshot"); 76 | 77 | testing::InitGoogleTest(&argc, argv); 78 | int ret = RUN_ALL_TESTS(); 79 | 80 | boost::filesystem::remove_all("_test_snapshot", code); 81 | return ret; 82 | } 83 | 84 | -------------------------------------------------------------------------------- /raft-kv/transport/transport.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace kv { 7 | 8 | class TransportImpl : public Transport { 9 | 10 | public: 11 | explicit TransportImpl(RaftServer* raft, uint64_t id) 12 | : raft_(raft), 13 | id_(id) { 14 | } 15 | 16 | ~TransportImpl() final { 17 | if (io_thread_.joinable()) { 18 | io_thread_.join(); 19 | LOG_DEBUG("transport stopped"); 20 | } 21 | } 22 | 23 | void start(const std::string& host) final { 24 | server_ = IoServer::create((void*) &io_service_, host, raft_); 25 | server_->start(); 26 | 27 | io_thread_ = std::thread([this]() { 28 | this->io_service_.run(); 29 | }); 30 | } 31 | 32 | void add_peer(uint64_t id, const std::string& peer) final { 33 | LOG_DEBUG("node:%lu, peer:%lu, addr:%s", id_, id, peer.c_str()); 34 | std::lock_guard guard(mutex_); 35 | 36 | auto it = peers_.find(id); 37 | if (it != peers_.end()) { 38 | LOG_DEBUG("peer already exists %lu", id); 39 | return; 40 | } 41 | 42 | PeerPtr p = Peer::creat(id, peer, (void*) &io_service_); 43 | p->start(); 44 | peers_[id] = p; 45 | } 46 | 47 | void remove_peer(uint64_t id) final { 48 | LOG_WARN("no impl yet"); 49 | } 50 | 51 | void send(std::vector msgs) final { 52 | auto callback = [this](std::vector msgs) { 53 | for 
(proto::MessagePtr& msg : msgs) { 54 | if (msg->to == 0) { 55 | // ignore intentionally dropped message 56 | continue; 57 | } 58 | 59 | auto it = peers_.find(msg->to); 60 | if (it != peers_.end()) { 61 | it->second->send(msg); 62 | continue; 63 | } 64 | LOG_DEBUG("ignored message %d (sent to unknown peer %lu)", msg->type, msg->to); 65 | } 66 | }; 67 | io_service_.post(std::bind(callback, std::move(msgs))); 68 | } 69 | 70 | void stop() final { 71 | io_service_.stop(); 72 | } 73 | 74 | private: 75 | RaftServer* raft_; 76 | uint64_t id_; 77 | 78 | std::thread io_thread_; 79 | boost::asio::io_service io_service_; 80 | 81 | std::mutex mutex_; 82 | std::unordered_map peers_; 83 | 84 | IoServerPtr server_; 85 | }; 86 | 87 | std::shared_ptr Transport::create(RaftServer* raft, uint64_t id) { 88 | std::shared_ptr impl(new TransportImpl(raft, id)); 89 | return impl; 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /raft-kv/common/slice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace kv { 9 | 10 | class Slice { 11 | public: 12 | // Create an empty slice. 13 | Slice() 14 | : data_(""), size_(0) {} 15 | 16 | // Create a slice that refers to d[0,n-1]. 
17 | Slice(const char* d, size_t n) 18 | : data_(d), size_(n) {} 19 | 20 | // Create a slice that refers to the contents of "s" 21 | Slice(const std::string& s) 22 | : data_(s.data()), size_(s.size()) {} 23 | 24 | // Create a slice that refers to s[0,strlen(s)-1] 25 | Slice(const char* s) 26 | : data_(s), size_(strlen(s)) {} 27 | 28 | // Return a pointer to the beginning of the referenced data 29 | const char* data() const { return data_; } 30 | 31 | // Return the length (in bytes) of the referenced data 32 | size_t size() const { return size_; } 33 | 34 | // Return true iff the length of the referenced data is zero 35 | bool empty() const { return size_ == 0; } 36 | 37 | // Return the ith byte in the referenced data. 38 | // REQUIRES: n < size() 39 | char operator[](size_t n) const { 40 | assert(n < size()); 41 | return data_[n]; 42 | } 43 | 44 | // Change this slice to refer to an empty array 45 | void clear() { 46 | data_ = ""; 47 | size_ = 0; 48 | } 49 | 50 | // Drop the first "n" bytes from this slice. 51 | void remove_prefix(size_t n) { 52 | assert(n <= size()); 53 | data_ += n; 54 | size_ -= n; 55 | } 56 | 57 | // Return a string that contains the copy of the referenced data. 58 | std::string to_string() const { return std::string(data_, size_); } 59 | 60 | // Three-way comparison. 
Returns value: 61 | // < 0 iff "*this" < "b", 62 | // == 0 iff "*this" == "b", 63 | // > 0 iff "*this" > "b" 64 | int compare(const Slice& b) const; 65 | 66 | // Return true iff "x" is a prefix of "*this" 67 | bool starts_with(const Slice& x) const { 68 | return ((size_ >= x.size_) && 69 | (memcmp(data_, x.data_, x.size_) == 0)); 70 | } 71 | 72 | private: 73 | const char* data_; 74 | size_t size_; 75 | 76 | // Intentionally copyable 77 | }; 78 | 79 | inline bool operator==(const Slice& x, const Slice& y) { 80 | return ((x.size() == y.size()) && 81 | (memcmp(x.data(), y.data(), x.size()) == 0)); 82 | } 83 | 84 | inline bool operator!=(const Slice& x, const Slice& y) { 85 | return !(x == y); 86 | } 87 | 88 | inline int Slice::compare(const Slice& b) const { 89 | const size_t min_len = (size_ < b.size_) ? size_ : b.size_; 90 | int r = memcmp(data_, b.data_, min_len); 91 | if (r == 0) { 92 | if (size_ < b.size_) r = -1; 93 | else if (size_ > b.size_) r = +1; 94 | } 95 | return r; 96 | } 97 | 98 | } -------------------------------------------------------------------------------- /tests/test_wal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace kv; 5 | 6 | TEST(wal, wal_len) { 7 | WAL_Record record; 8 | 9 | for (int i = 0; i < MAX_WAL_RECORD_LEN; ++i) { 10 | set_WAL_Record_len(record, i); 11 | ASSERT_EQ(WAL_Record_len(record), i); 12 | } 13 | 14 | ASSERT_EQ(MAX_WAL_RECORD_LEN, 0xff << 16 | 0xff << 8 | 0xff); 15 | fprintf(stderr, "max wal record len %d\n", MAX_WAL_RECORD_LEN); 16 | 17 | uint32_t len = MAX_WAL_RECORD_LEN + 1; 18 | set_WAL_Record_len(record, len); 19 | ASSERT_EQ(WAL_Record_len(record), MAX_WAL_RECORD_LEN); 20 | } 21 | 22 | TEST(wal, ScanWalName) { 23 | struct Test { 24 | std::string str; 25 | uint64_t wseq; 26 | uint64_t windex; 27 | bool wok; 28 | }; 29 | 30 | std::vector test; 31 | 32 | test.push_back(Test{.str = "0000000000000000-0000000000000000.wal", .wseq = 0, 
.windex = 0, .wok = true}); 33 | test.push_back(Test{.str = "0000000000000100-0000000000000101.wal", .wseq = 0x100, .windex = 0x101, .wok = true}); 34 | test.push_back(Test{.str = "0000000000000000.wa", .wseq = 0, .windex = 0, .wok = false}); 35 | test.push_back(Test{.str = "0000000000000000-0000000000000000.snap", .wseq = 0, .windex = 0, .wok = false}); 36 | for (auto& test : test) { 37 | uint64_t seq; 38 | uint64_t index; 39 | bool ok = WAL::parse_wal_name(test.str, &seq, &index); 40 | 41 | ASSERT_EQ(ok, test.wok); 42 | ASSERT_EQ(seq, test.wseq); 43 | ASSERT_EQ(index, test.windex); 44 | } 45 | } 46 | 47 | TEST(wal, SearchIndex) { 48 | struct Test { 49 | std::vector names; 50 | uint64_t index; 51 | int widx; 52 | bool wok; 53 | }; 54 | 55 | std::vector test; 56 | { 57 | Test t; 58 | t.names = {"0000000000000000-0000000000000000.wal", "0000000000000001-0000000000001000.wal", "0000000000000002-0000000000002000.wal"}; 59 | t.index = 0x1000; 60 | t.widx = 1; 61 | t.wok = true; 62 | //test.push_back(t); 63 | } 64 | { 65 | Test t; 66 | t.names = {"0000000000000001-0000000000004000.wal", "0000000000000002-0000000000003000.wal", "0000000000000003-0000000000005000.wal"}; 67 | t.index = 0x4000; 68 | t.widx = 1; 69 | t.wok = true; 70 | //test.push_back(t); 71 | } 72 | { 73 | Test t; 74 | t.names = {"0000000000000001-0000000000002000.wal", "0000000000000002-0000000000003000.wal", "0000000000000003-0000000000005000.wal"}; 75 | t.index = 0x1000; 76 | t.widx = -1; 77 | t.wok = false; 78 | test.push_back(t); 79 | } 80 | 81 | for (auto& test : test) { 82 | uint64_t i; 83 | bool ok = WAL::search_index(test.names, test.index, &i); 84 | ASSERT_EQ(ok, test.wok); 85 | ASSERT_EQ(i, test.widx); 86 | } 87 | } 88 | 89 | int main(int argc, char* argv[]) { 90 | testing::InitGoogleTest(&argc, argv); 91 | return RUN_ALL_TESTS(); 92 | } 93 | -------------------------------------------------------------------------------- /raft-kv/raft/ready.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | Ready::Ready(std::shared_ptr raft, SoftStatePtr pre_soft_state, const proto::HardState& pre_hard_state) 8 | : entries(raft->raft_log_->unstable_entries()) { 9 | std::swap(this->messages, raft->msgs_); 10 | 11 | raft->raft_log_->next_entries(committed_entries); 12 | 13 | SoftStatePtr st = raft->soft_state(); 14 | if (!st->equal(*pre_soft_state)) { 15 | this->soft_state = st; 16 | } 17 | 18 | proto::HardState hs = raft->hard_state(); 19 | if (!hs.equal(pre_hard_state)) { 20 | this->hard_state = hs; 21 | } 22 | 23 | proto::SnapshotPtr snapshot = raft->raft_log_->unstable_->snapshot_; 24 | if (snapshot) { 25 | //copy 26 | this->snapshot = *snapshot; 27 | } 28 | if (!raft->read_states_.empty()) { 29 | this->read_states = raft->read_states_; 30 | } 31 | 32 | this->must_sync = is_must_sync(hs, hard_state, entries.size()); 33 | } 34 | 35 | bool Ready::contains_updates() const { 36 | return soft_state != nullptr || !hard_state.is_empty_state() || 37 | !snapshot.is_empty() || !entries.empty() || 38 | !committed_entries.empty() || !messages.empty() || read_states.empty(); 39 | } 40 | 41 | uint64_t Ready::applied_cursor() const { 42 | if (!committed_entries.empty()) { 43 | return committed_entries.back()->index; 44 | } 45 | uint64_t index = snapshot.metadata.index; 46 | if (index > 0) { 47 | return index; 48 | } 49 | return 0; 50 | } 51 | 52 | bool Ready::equal(const Ready& rd) const { 53 | if ((soft_state && !rd.soft_state) || (!soft_state && rd.soft_state)) { 54 | return false; 55 | } 56 | if (soft_state && rd.soft_state && !soft_state->equal(*rd.soft_state)) { 57 | return false; 58 | } 59 | if (!hard_state.equal(rd.hard_state)) { 60 | return false; 61 | } 62 | 63 | if (read_states.size() != read_states.size()) { 64 | return false; 65 | } 66 | 67 | for (size_t i = 0; i < read_states.size(); ++i) { 68 | if 
(!read_states[i].equal(rd.read_states[i])) { 69 | return false; 70 | } 71 | } 72 | 73 | if (entries.size() != rd.entries.size()) { 74 | return false; 75 | } 76 | 77 | for (size_t i = 0; i < entries.size(); ++i) { 78 | if (*entries[i] != *rd.entries[i]) { 79 | return false; 80 | } 81 | } 82 | 83 | if (!snapshot.equal(rd.snapshot)) { 84 | return false; 85 | } 86 | 87 | if (committed_entries.size() != rd.committed_entries.size()) { 88 | return false; 89 | } 90 | 91 | for (size_t i = 0; i < committed_entries.size(); ++i) { 92 | if (*committed_entries[i] != *rd.committed_entries[i]) { 93 | return false; 94 | } 95 | } 96 | return must_sync == rd.must_sync; 97 | } 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /raft-kv/raft/ready.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | enum RaftState { 8 | Follower = 0, 9 | Candidate = 1, 10 | Leader = 2, 11 | PreCandidate = 3, 12 | }; 13 | 14 | // soft_state provides state that is useful for logging and debugging. 15 | // The state is volatile and does not need to be persisted to the wal. 16 | struct SoftState { 17 | explicit SoftState(uint64_t lead, RaftState state) 18 | : lead(lead), state(state) { 19 | 20 | } 21 | 22 | bool equal(const SoftState& ss) const { 23 | return lead == ss.lead && state == ss.state; 24 | } 25 | 26 | uint64_t lead; // must use atomic operations to access; keep 64-bit aligned. 
27 | RaftState state; 28 | }; 29 | typedef std::shared_ptr SoftStatePtr; 30 | class Raft; 31 | 32 | struct Ready { 33 | Ready() 34 | : must_sync(false) {} 35 | 36 | explicit Ready(std::shared_ptr raft, SoftStatePtr pre_soft_state, const proto::HardState& pre_hard_state); 37 | 38 | bool contains_updates() const; 39 | 40 | bool equal(const Ready& rd) const; 41 | 42 | // applied_cursor extracts from the Ready the highest index the client has 43 | // applied (once the Ready is confirmed via Advance). If no information is 44 | // contained in the Ready, returns zero. 45 | uint64_t applied_cursor() const; 46 | 47 | // The current volatile state of a Node. 48 | // soft_state will be nil if there is no update. 49 | // It is not required to consume or store soft_state. 50 | SoftStatePtr soft_state; 51 | 52 | // The current state of a Node to be saved to stable storage BEFORE 53 | // messages are sent. 54 | // hard_state will be equal to empty state if there is no update. 55 | proto::HardState hard_state; 56 | 57 | // read_states can be used for node to serve linearizable read requests locally 58 | // when its applied index is greater than the index in ReadState. 59 | // Note that the readState will be returned when raft receives msgReadIndex. 60 | // The returned is only valid for the request that requested to read. 61 | std::vector read_states; 62 | 63 | // entries specifies entries to be saved to stable storage BEFORE 64 | // messages are sent 65 | std::vector entries; 66 | 67 | // Snapshot specifies the snapshot to be saved to stable storage. 68 | proto::Snapshot snapshot; 69 | 70 | // committed_entries specifies entries to be committed to a 71 | // store/state-machine. These have previously been committed to stable 72 | // store. 73 | std::vector committed_entries; 74 | 75 | // messages specifies outbound messages to be sent AFTER entries are 76 | // committed to stable storage. 
typedef uint8_t WAL_type;

// Fixed header placed in front of every record's payload. The struct is
// packed to 1 byte, so the header is exactly 8 bytes with no padding.
#pragma pack(1)
struct WAL_Record {
  WAL_type type; /* kind of payload that follows */
  uint8_t len[3]; /* payload length, least significant byte first; at most 0x00FFFFFF */
  uint32_t crc; /* crc32 of the payload */
  char data[0]; /* payload starts here */
};
#pragma pack()

#define MAX_WAL_RECORD_LEN (0x00FFFFFF)

// Decodes the 24-bit payload length stored in `record`.
static inline uint32_t WAL_Record_len(const WAL_Record& record) {
  uint32_t length = 0;
  // Fold from the most significant byte (len[2]) down to len[0].
  for (int byte = 2; byte >= 0; --byte) {
    length = (length << 8) | uint32_t(record.len[byte]);
  }
  return length;
}

// Stores `len` into `record` as a 24-bit value. Lengths above
// MAX_WAL_RECORD_LEN are clamped to MAX_WAL_RECORD_LEN.
static inline void set_WAL_Record_len(WAL_Record& record, uint32_t len) {
  uint32_t remaining = len;
  if (remaining > (uint32_t) MAX_WAL_RECORD_LEN) {
    remaining = (uint32_t) MAX_WAL_RECORD_LEN;
  }
  for (int byte = 0; byte < 3; ++byte) {
    record.len[byte] = remaining & 0x000000FF;
    remaining >>= 8;
  }
}
52 | Status read_all(proto::HardState& hs, std::vector& ents); 53 | 54 | Status save(proto::HardState hs, const std::vector& ents); 55 | 56 | Status save_snapshot(const WAL_Snapshot& snap); 57 | 58 | Status save_entry(const proto::Entry& entry); 59 | 60 | Status save_hard_state(const proto::HardState& hs); 61 | 62 | Status cut(); 63 | 64 | // release_to releases the wal file, which has smaller index than the given index 65 | // except the largest one among them. 66 | // For example, if WAL is holding lock 1,2,3,4,5,6, release_to(4) will release 67 | // lock 1,2 but keep 3. release_to(5) will release 1,2,3 but keep 4. 68 | Status release_to(uint64_t index); 69 | 70 | void get_wal_names(const std::string& dir, std::vector& names); 71 | 72 | static bool parse_wal_name(const std::string& name, uint64_t* seq, uint64_t* index); 73 | 74 | // names should have been sorted based on sequence number. 75 | // is_valid_seq checks whether seq increases continuously. 76 | static bool is_valid_seq(const std::vector& names); 77 | 78 | static bool search_index(const std::vector& names, uint64_t index, uint64_t* name_index); 79 | 80 | private: 81 | explicit WAL(const std::string& dir) 82 | : dir_(dir), 83 | enti_(0) { 84 | memset(&start_, 0, sizeof(start_)); 85 | } 86 | 87 | void handle_record_wal_record(WAL_type type, 88 | const char* data, 89 | size_t data_len, 90 | bool& matchsnap, 91 | proto::HardState& hs, 92 | std::vector& ents); 93 | 94 | std::string dir_; 95 | proto::HardState state_; // hardstate recorded at the head of WAL 96 | WAL_Snapshot start_; // snapshot to start reading 97 | uint64_t enti_; // index of the last entry saved to the wal 98 | std::vector> files_; 99 | 100 | }; 101 | 102 | } 103 | -------------------------------------------------------------------------------- /tests/test_msgpack.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | class MyClass { 6 | public: 7 | std::string 
str; 8 | std::vector vec; 9 | public: 10 | MSGPACK_DEFINE (str, vec); 11 | }; 12 | 13 | TEST(test_msgpack, test_msgpack) { 14 | std::vector vec; 15 | 16 | MyClass my; 17 | my.str = "abc"; 18 | my.vec.push_back(1); 19 | my.vec.push_back(3); 20 | 21 | vec.push_back(std::move(my)); 22 | 23 | msgpack::sbuffer sbuf; 24 | msgpack::pack(sbuf, vec); 25 | 26 | msgpack::object_handle oh = msgpack::unpack(sbuf.data(), sbuf.size()); 27 | 28 | msgpack::object obj = oh.get(); 29 | std::vector rvec; 30 | obj.convert(rvec); 31 | 32 | ASSERT_TRUE(rvec.size() == 1); 33 | MyClass& out = rvec[0]; 34 | ASSERT_TRUE(out.str == "abc"); 35 | ASSERT_TRUE(out.vec.size() == 2); 36 | ASSERT_TRUE(out.vec[0] == 1); 37 | ASSERT_TRUE(out.vec[1] == 3); 38 | } 39 | 40 | TEST(test_msgpack, test_error) { 41 | std::vector vec; 42 | 43 | MyClass my; 44 | my.str = "abc"; 45 | my.vec.push_back(1); 46 | my.vec.push_back(3); 47 | 48 | vec.push_back(std::move(my)); 49 | 50 | msgpack::sbuffer sbuf; 51 | msgpack::pack(sbuf, vec); 52 | 53 | msgpack::object_handle oh = msgpack::unpack(sbuf.data(), sbuf.size()); 54 | 55 | msgpack::object obj = oh.get(); 56 | std::string out; 57 | ASSERT_ANY_THROW(obj.convert(out)); 58 | 59 | } 60 | 61 | class B { 62 | public: 63 | int a; 64 | public: 65 | MSGPACK_DEFINE (a); 66 | }; 67 | 68 | TEST(msgpack, entry_size) { 69 | using namespace kv::proto; 70 | 71 | Entry entry; 72 | entry.type = 10; 73 | entry.term = 10; 74 | entry.index = 10; 75 | { 76 | msgpack::sbuffer sbuf; 77 | msgpack::pack(sbuf, entry); 78 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 79 | } 80 | 81 | { 82 | entry.type = 255; 83 | msgpack::sbuffer sbuf; 84 | msgpack::pack(sbuf, entry); 85 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 86 | } 87 | 88 | { 89 | entry.term = std::numeric_limits::max() - 10; 90 | msgpack::sbuffer sbuf; 91 | msgpack::pack(sbuf, entry); 92 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 93 | } 94 | 95 | { 96 | entry.term = std::numeric_limits::max() + 10; 97 | 
msgpack::sbuffer sbuf; 98 | msgpack::pack(sbuf, entry); 99 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 100 | } 101 | 102 | { 103 | entry.term = std::numeric_limits::max() + 10; 104 | msgpack::sbuffer sbuf; 105 | msgpack::pack(sbuf, entry); 106 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 107 | } 108 | 109 | { 110 | entry.term = std::numeric_limits::max() + 10; 111 | msgpack::sbuffer sbuf; 112 | msgpack::pack(sbuf, entry); 113 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 114 | } 115 | 116 | { 117 | entry.data.resize(std::numeric_limits::max() - 1); 118 | msgpack::sbuffer sbuf; 119 | msgpack::pack(sbuf, entry); 120 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 121 | } 122 | 123 | { 124 | entry.data.resize(std::numeric_limits::max() - 1); 125 | msgpack::sbuffer sbuf; 126 | msgpack::pack(sbuf, entry); 127 | ASSERT_TRUE(entry.serialize_size() == sbuf.size()); 128 | } 129 | 130 | { 131 | entry.data.resize(std::numeric_limits::max() + 1); 132 | msgpack::sbuffer sbuf; 133 | msgpack::pack(sbuf, entry); 134 | ASSERT_EQ(entry.serialize_size(), sbuf.size()); 135 | } 136 | } 137 | 138 | int main(int argc, char* argv[]) { 139 | testing::InitGoogleTest(&argc, argv); 140 | return RUN_ALL_TESTS(); 141 | } -------------------------------------------------------------------------------- /raft-kv/raft/unstable.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace kv { 5 | 6 | void Unstable::maybe_first_index(uint64_t& index, bool& ok) { 7 | if (snapshot_) { 8 | ok = true; 9 | index = snapshot_->metadata.index + 1; 10 | } else { 11 | ok = false; 12 | index = 0; 13 | } 14 | } 15 | 16 | void Unstable::maybe_last_index(uint64_t& index, bool& ok) { 17 | if (!entries_.empty()) { 18 | ok = true; 19 | index = offset_ + entries_.size() - 1; 20 | return; 21 | } 22 | if (snapshot_) { 23 | ok = true; 24 | index = snapshot_->metadata.index; 25 | return; 26 | } 27 | index = 0; 28 | ok = 
false; 29 | } 30 | 31 | void Unstable::maybe_term(uint64_t index, uint64_t& term, bool& ok) { 32 | term = 0; 33 | ok = false; 34 | 35 | if (index < offset_) { 36 | if (!snapshot_) { 37 | return; 38 | } 39 | if (snapshot_->metadata.index == index) { 40 | term = snapshot_->metadata.term; 41 | ok = true; 42 | return; 43 | } 44 | return; 45 | } 46 | 47 | uint64_t last = 0; 48 | bool last_ok = false; 49 | maybe_last_index(last, last_ok); 50 | if (!last_ok) { 51 | return; 52 | } 53 | if (index > last) { 54 | return; 55 | 56 | } 57 | ok = true; 58 | term = entries_[index - offset_]->term; 59 | } 60 | 61 | void Unstable::stable_to(uint64_t index, uint64_t term) { 62 | uint64_t gt = 0; 63 | bool ok = false; 64 | maybe_term(index, gt, ok); 65 | 66 | if (!ok) { 67 | return; 68 | } 69 | // if index < offset, term is matched with the snapshot 70 | // only update the unstable entries if term is matched with 71 | // an unstable entry. 72 | if (gt == term && index >= offset_) { 73 | uint64_t n = index + 1 - offset_; 74 | entries_.erase(entries_.begin(), entries_.begin() + n); 75 | offset_ = index + 1; 76 | } 77 | } 78 | 79 | void Unstable::stable_snap_to(uint64_t index) { 80 | if (snapshot_ && snapshot_->metadata.index == index) { 81 | snapshot_ = nullptr; 82 | } 83 | } 84 | 85 | void Unstable::restore(proto::SnapshotPtr snapshot) { 86 | offset_ = snapshot->metadata.index + 1; 87 | entries_.clear(); 88 | snapshot_ = snapshot; 89 | } 90 | 91 | void Unstable::truncate_and_append(std::vector entries) { 92 | if (entries.empty()) { 93 | return; 94 | } 95 | uint64_t after = entries[0]->index; 96 | if (after == offset_ + entries_.size()) { 97 | // directly append 98 | entries_.insert(entries_.end(), entries.begin(), entries.end()); 99 | } else if (after <= offset_) { 100 | // The log is being truncated to before our current offset 101 | // portion, so set the offset and replace the entries 102 | LOG_INFO("replace the unstable entries from index %lu", after); 103 | offset_ = after; 104 | 
entries_ = std::move(entries); 105 | } else { 106 | // truncate to after and copy entries_ 107 | // then append 108 | LOG_INFO("truncate the unstable entries before index %lu", after); 109 | std::vector entries_slice; 110 | this->slice(offset_, after, entries_slice); 111 | 112 | entries_slice.insert(entries_slice.end(), entries.begin(), entries.end()); 113 | entries_ = std::move(entries_slice); 114 | } 115 | } 116 | 117 | void Unstable::slice(uint64_t low, uint64_t high, std::vector& entries) { 118 | assert(high > low); 119 | uint64_t upper = offset_ + entries_.size(); 120 | if (low < offset_ || high > upper) { 121 | LOG_FATAL("unstable.slice[%lu,%lu) out of bound [%lu,%lu]", low, high, offset_, upper); 122 | } 123 | 124 | entries.insert(entries.end(), entries_.begin() + low - offset_, entries_.begin() + high - offset_); 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /raft-kv/snap/snapshotter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace kv { 9 | 10 | struct SnapshotRecord { 11 | uint32_t data_len; 12 | uint32_t crc32; 13 | char data[0]; 14 | }; 15 | 16 | Status Snapshotter::load(proto::Snapshot& snapshot) { 17 | std::vector names; 18 | get_snap_names(names); 19 | 20 | for (std::string& filename : names) { 21 | Status status = load_snap(filename, snapshot); 22 | if (status.is_ok()) { 23 | return Status::ok(); 24 | } 25 | } 26 | 27 | return Status::not_found("snap not found"); 28 | } 29 | 30 | std::string Snapshotter::snap_name(uint64_t term, uint64_t index) { 31 | char buffer[64]; 32 | snprintf(buffer, sizeof(buffer), "%016" PRIx64 "-%016" PRIx64 ".snap", term, index); 33 | return buffer; 34 | } 35 | 36 | Status Snapshotter::save_snap(const proto::Snapshot& snapshot) { 37 | Status status; 38 | msgpack::sbuffer sbuf; 39 | msgpack::pack(sbuf, snapshot); 40 | 41 | 
SnapshotRecord* record = (SnapshotRecord*) malloc(sbuf.size() + sizeof(SnapshotRecord)); 42 | record->data_len = sbuf.size(); 43 | record->crc32 = compute_crc32(sbuf.data(), sbuf.size()); 44 | memcpy(record->data, sbuf.data(), sbuf.size()); 45 | 46 | char save_path[128]; 47 | snprintf(save_path, 48 | sizeof(save_path), 49 | "%s/%s", 50 | dir_.c_str(), 51 | snap_name(snapshot.metadata.term, snapshot.metadata.index).c_str()); 52 | 53 | FILE* fp = fopen(save_path, "w"); 54 | if (!fp) { 55 | free(record); 56 | return Status::io_error(strerror(errno)); 57 | } 58 | 59 | size_t bytes = sizeof(SnapshotRecord) + record->data_len; 60 | if (fwrite((void*) record, 1, bytes, fp) != bytes) { 61 | status = Status::io_error(strerror(errno)); 62 | } 63 | free(record); 64 | fclose(fp); 65 | 66 | return status; 67 | } 68 | 69 | void Snapshotter::get_snap_names(std::vector& names) { 70 | using namespace boost; 71 | 72 | filesystem::directory_iterator end; 73 | for (boost::filesystem::directory_iterator it(dir_); it != end; it++) { 74 | filesystem::path filename = (*it).path().filename(); 75 | filesystem::path extension = filename.extension(); 76 | if (extension != ".snap") { 77 | continue; 78 | } 79 | names.push_back(filename.string()); 80 | } 81 | std::sort(names.begin(), names.end(), std::greater()); 82 | } 83 | 84 | Status Snapshotter::load_snap(const std::string& filename, proto::Snapshot& snapshot) { 85 | using namespace boost; 86 | SnapshotRecord snap_hdr; 87 | std::vector data; 88 | filesystem::path path = filesystem::path(dir_) / filename; 89 | FILE* fp = fopen(path.c_str(), "r"); 90 | 91 | if (!fp) { 92 | goto invalid_snap; 93 | } 94 | 95 | if (fread(&snap_hdr, 1, sizeof(SnapshotRecord), fp) != sizeof(SnapshotRecord)) { 96 | goto invalid_snap; 97 | } 98 | 99 | if (snap_hdr.data_len == 0 || snap_hdr.crc32 == 0) { 100 | goto invalid_snap; 101 | } 102 | 103 | data.resize(snap_hdr.data_len); 104 | if (fread(data.data(), 1, snap_hdr.data_len, fp) != snap_hdr.data_len) { 105 | goto 
invalid_snap; 106 | } 107 | 108 | fclose(fp); 109 | fp = NULL; 110 | if (compute_crc32(data.data(), data.size()) != snap_hdr.crc32) { 111 | goto invalid_snap; 112 | } 113 | 114 | try { 115 | 116 | msgpack::object_handle oh = msgpack::unpack((const char*) data.data(), data.size()); 117 | oh.get().convert(snapshot); 118 | return Status::ok(); 119 | 120 | } catch (std::exception& e) { 121 | goto invalid_snap; 122 | } 123 | 124 | invalid_snap: 125 | if (fp) { 126 | fclose(fp); 127 | } 128 | LOG_INFO("broken snapshot %s", path.string().c_str()); 129 | filesystem::rename(path, path.string() + ".broken"); 130 | return Status::io_error("unexpected empty snapshot"); 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /tests/raft_snap_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "network.hpp" 6 | 7 | using namespace kv; 8 | 9 | static proto::Snapshot& testingSnap() { 10 | static proto::SnapshotPtr snapshot; 11 | if (!snapshot) { 12 | snapshot = std::make_shared(); 13 | snapshot->metadata.index = 11; 14 | snapshot->metadata.term = 11; 15 | snapshot->metadata.conf_state.nodes.push_back(1); 16 | snapshot->metadata.conf_state.nodes.push_back(2); 17 | 18 | } 19 | return *snapshot; 20 | } 21 | 22 | TEST(snap, SendingSnapshotSetPendingSnapshot) { 23 | auto storage = std::make_shared(); 24 | auto sm = newTestRaft(1, {1}, 10, 1, storage); 25 | sm->restore(testingSnap()); 26 | 27 | sm->become_candidate(); 28 | sm->become_leader(); 29 | 30 | // force set the next of node 1, so that 31 | // node 1 needs a snapshot 32 | sm->prs_[2]->next = sm->raft_log_->first_index(); 33 | 34 | proto::MessagePtr msg(new proto::Message()); 35 | msg->from = 2; 36 | msg->to = 1; 37 | msg->type = proto::MsgAppResp; 38 | msg->index = sm->prs_[2]->next - 1; 39 | msg->reject = true; 40 | 41 | sm->step(msg); 42 | 43 | 
// A rejected MsgSnapStatus from the follower must clear the pending snapshot,
// leave `next` untouched and keep the follower's progress paused.
TEST(snap, SnapshotFailure) {
  auto storage = std::make_shared<MemoryStorage>();
  auto sm = newTestRaft(1, {1, 2}, 10, 1, storage);
  sm->restore(testingSnap());

  sm->become_candidate();
  sm->become_leader();

  // Put follower 2 into snapshot state with a pending snapshot at index 11.
  sm->prs_[2]->next = 1;
  sm->prs_[2]->become_snapshot(11);

  // Follower 2 reports that applying the snapshot failed.
  proto::MessagePtr failure(new proto::Message());
  failure->type = proto::MsgSnapStatus;
  failure->from = 2;
  failure->to = 1;
  failure->reject = true;

  sm->step(failure);

  ASSERT_EQ(sm->prs_[2]->pending_snapshot, 0);
  ASSERT_EQ(sm->prs_[2]->next, 1);
  ASSERT_TRUE(sm->prs_[2]->paused);
}
= std::make_shared(); 120 | auto sm = newTestRaft(1, {1, 2}, 10, 1, storage); 121 | sm->restore(testingSnap()); 122 | 123 | sm->become_candidate(); 124 | sm->become_leader(); 125 | 126 | sm->prs_[2]->next = 1; 127 | sm->prs_[2]->become_snapshot(11); 128 | 129 | // A successful msgAppResp that has a higher/equal index than the 130 | // pending snapshot should abort the pending snapshot. 131 | proto::MessagePtr msg(new proto::Message()); 132 | msg->from = 2; 133 | msg->to = 1; 134 | msg->type = proto::MsgAppResp; 135 | msg->index = 11; 136 | 137 | sm->step(msg); 138 | 139 | ASSERT_EQ(sm->prs_[2]->pending_snapshot, 0); 140 | ASSERT_EQ(sm->prs_[2]->next, 12); 141 | } 142 | 143 | int main(int argc, char* argv[]) { 144 | testing::InitGoogleTest(&argc, argv); 145 | return RUN_ALL_TESTS(); 146 | } 147 | -------------------------------------------------------------------------------- /raft-kv/raft/proto.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | namespace kv { 7 | namespace proto { 8 | 9 | const char* msg_type_to_string(MessageType type) { 10 | switch (type) { 11 | case MsgHup: { 12 | return "MsgHup"; 13 | } 14 | case MsgBeat: { 15 | return "MsgBeat"; 16 | } 17 | case MsgProp: { 18 | return "MsgProp"; 19 | } 20 | case MsgApp: { 21 | return "MsgApp"; 22 | } 23 | case MsgAppResp: { 24 | return "MsgAppResp"; 25 | } 26 | case MsgVote: { 27 | return "MsgVote"; 28 | } 29 | case MsgVoteResp: { 30 | return "MsgVoteResp"; 31 | } 32 | case MsgSnap: { 33 | return "MsgSnap"; 34 | } 35 | case MsgHeartbeat: { 36 | return "MsgHeartbeat"; 37 | } 38 | case MsgHeartbeatResp: { 39 | return "MsgHeartbeatResp"; 40 | } 41 | case MsgUnreachable: { 42 | return "MsgUnreachable"; 43 | } 44 | case MsgSnapStatus: { 45 | return "MsgSnapStatus"; 46 | } 47 | case MsgCheckQuorum: { 48 | return "MsgCheckQuorum"; 49 | } 50 | case MsgTransferLeader: { 51 | return "MsgTransferLeader"; 52 | } 53 | case MsgTimeoutNow: 
// data_serialize_size returns the number of bytes needed to encode a binary
// payload of `len` bytes: a format header followed by the payload itself.
// The header sizes match msgpack's bin 8 / bin 16 / bin 32 formats:
//   len <= 2^8  - 1 : 1 type byte + 1 length byte  -> 2 + len
//   len <= 2^16 - 1 : 1 type byte + 2 length bytes -> 3 + len
//   otherwise       : 1 type byte + 4 length bytes -> 5 + len
static uint32_t data_serialize_size(uint32_t len) {
  if (len <= std::numeric_limits<uint8_t>::max()) {
    return 2 + len;
  }
  if (len <= std::numeric_limits<uint16_t>::max()) {
    return 3 + len;
  }
  // Every remaining uint32_t value takes the 4-byte length form. Returning
  // unconditionally guarantees a value on every path; a trailing
  // assert(false) would compile away under NDEBUG and leave the function
  // without a return statement.
  return 5 + len;
}
The term of the entry before 26 | // FirstIndex is retained for matching purposes even though the 27 | // rest of that entry may not be available. 28 | virtual Status term(uint64_t i, uint64_t& term) = 0; 29 | 30 | // LastIndex returns the index of the last entry in the log. 31 | virtual Status last_index(uint64_t& index) = 0; 32 | 33 | // firstIndex returns the index of the first log entry that is 34 | // possibly available via entries (older entries have been incorporated 35 | // into the latest Snapshot; if storage only contains the dummy entry the 36 | // first log entry is not available). 37 | virtual Status first_index(uint64_t& index) = 0; 38 | 39 | // Snapshot returns the most recent snapshot. 40 | // If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable, 41 | // so raft state machine could know that Storage needs some time to prepare 42 | // snapshot and call Snapshot later. 43 | virtual Status snapshot(proto::SnapshotPtr& snapshot) = 0; 44 | }; 45 | typedef std::shared_ptr StoragePtr; 46 | 47 | // MemoryStorage implements the Storage interface backed by an 48 | // in-memory array. 49 | class MemoryStorage : public Storage { 50 | public: 51 | 52 | // creates an empty MemoryStorage 53 | explicit MemoryStorage() 54 | : snapshot_(new proto::Snapshot()) { 55 | // When starting from scratch populate the list with a dummy entry at term zero. 
56 | proto::EntryPtr entry(new proto::Entry()); 57 | entries_.emplace_back(std::move(entry)); 58 | } 59 | 60 | virtual Status initial_state(proto::HardState& hard_state, proto::ConfState& conf_state); 61 | 62 | void set_hard_state(proto::HardState& hard_state); 63 | 64 | virtual Status entries(uint64_t low, 65 | uint64_t high, 66 | uint64_t max_size, 67 | std::vector& entries); 68 | 69 | virtual Status term(uint64_t i, uint64_t& term); 70 | 71 | virtual Status last_index(uint64_t& index); 72 | 73 | virtual Status first_index(uint64_t& index); 74 | 75 | virtual Status snapshot(proto::SnapshotPtr& snapshot); 76 | 77 | // compact discards all log entries prior to compact_index. 78 | // It is the application's responsibility to not attempt to compact an index 79 | // greater than raftLog.applied. 80 | Status compact(uint64_t compact_index); 81 | 82 | // append the new entries to storage. 83 | Status append(std::vector entries); 84 | 85 | // create_snapshot makes a snapshot which can be retrieved with Snapshot() and 86 | // can be used to reconstruct the state at that point. 87 | // If any configuration changes have been made since the last compaction, 88 | // the result of the last apply_conf_change must be passed in. 89 | Status create_snapshot(uint64_t index, 90 | proto::ConfStatePtr cs, 91 | std::vector data, 92 | proto::SnapshotPtr& snapshot); 93 | 94 | // ApplySnapshot overwrites the contents of this Storage object with 95 | // those of the given snapshot. 
96 | Status apply_snapshot(const proto::Snapshot& snapshot); 97 | public: 98 | Status last_index_impl(uint64_t& index); 99 | Status first_index_impl(uint64_t& index); 100 | 101 | std::mutex mutex_; 102 | proto::HardState hard_state_; 103 | proto::SnapshotPtr snapshot_; 104 | // entries_[i] has raft log position i+snapshot.Metadata.Index 105 | std::vector entries_; 106 | }; 107 | typedef std::shared_ptr MemoryStoragePtr; 108 | 109 | } 110 | -------------------------------------------------------------------------------- /raft-kv/raft/progress.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace kv { 5 | 6 | const char* progress_state_to_string(ProgressState state) { 7 | switch (state) { 8 | case ProgressStateProbe: { 9 | return "ProgressStateProbe"; 10 | } 11 | case ProgressStateReplicate: { 12 | return "ProgressStateReplicate"; 13 | } 14 | case ProgressStateSnapshot: { 15 | return "ProgressStateSnapshot"; 16 | } 17 | default: { 18 | LOG_FATAL("unknown state %d", state); 19 | } 20 | } 21 | } 22 | 23 | void InFlights::add(uint64_t inflight) { 24 | if (is_full()) { 25 | LOG_FATAL("cannot add into a full inflights"); 26 | } 27 | 28 | uint64_t next = start + count; 29 | 30 | if (next >= size) { 31 | next -= size; 32 | } 33 | if (next >= buffer.size()) { 34 | uint32_t new_size = buffer.size() * 2; 35 | if (new_size == 0) { 36 | new_size = 1; 37 | } else if (new_size > size) { 38 | new_size = size; 39 | } 40 | buffer.resize(new_size); 41 | } 42 | buffer[next] = inflight; 43 | count++; 44 | } 45 | 46 | void InFlights::free_to(uint64_t to) { 47 | if (count == 0 || to < buffer[start]) { 48 | // out of the left side of the window 49 | return; 50 | } 51 | 52 | uint32_t idx = start; 53 | size_t i; 54 | for (i = 0; i < count; i++) { 55 | if (to < buffer[idx]) { // found the first large inflight 56 | break; 57 | } 58 | 59 | // increase index and maybe rotate 60 | idx++; 61 | 62 | if (idx >= size) { 63 | 
idx -= size; 64 | } 65 | } 66 | // free i inflights and set new start index 67 | count -= i; 68 | start = idx; 69 | if (count == 0) { 70 | // inflights is empty, reset the start index so that we don't grow the 71 | // buffer unnecessarily. 72 | start = 0; 73 | } 74 | } 75 | 76 | void InFlights::free_first_one() { 77 | free_to(buffer[start]); 78 | } 79 | 80 | void Progress::become_replicate() { 81 | reset_state(ProgressStateReplicate); 82 | next = match + 1; 83 | } 84 | 85 | void Progress::become_probe() { 86 | // If the original state is ProgressStateSnapshot, progress knows that 87 | // the pending snapshot has been sent to this peer successfully, then 88 | // probes from pendingSnapshot + 1. 89 | if (state == ProgressStateSnapshot) { 90 | uint64_t pending = pending_snapshot; 91 | reset_state(ProgressStateProbe); 92 | next = std::max(match + 1, pending + 1); 93 | } else { 94 | reset_state(ProgressStateProbe); 95 | next = match + 1; 96 | } 97 | } 98 | 99 | void Progress::become_snapshot(uint64_t snapshoti) { 100 | reset_state(ProgressStateSnapshot); 101 | pending_snapshot = snapshoti; 102 | } 103 | 104 | void Progress::reset_state(ProgressState st) { 105 | paused = false; 106 | pending_snapshot = 0; 107 | this->state = st; 108 | this->inflights->reset(); 109 | } 110 | 111 | std::string Progress::string() const { 112 | char buffer[256]; 113 | int n = snprintf(buffer, 114 | sizeof(buffer), 115 | "next = %lu, match = %lu, state = %s, waiting = %d, pendingSnapshot = %lu", 116 | next, 117 | match, 118 | progress_state_to_string(state), 119 | is_paused(), 120 | pending_snapshot); 121 | return std::string(buffer, n); 122 | } 123 | 124 | bool Progress::is_paused() const { 125 | switch (state) { 126 | case ProgressStateProbe: { 127 | return paused; 128 | } 129 | case ProgressStateReplicate: { 130 | return inflights->is_full(); 131 | } 132 | case ProgressStateSnapshot: { 133 | return true; 134 | } 135 | default: { 136 | LOG_FATAL("unexpected state"); 137 | } 138 | } 139 | } 
// InFlights is a bounded window over the messages currently in flight.
// Each stored value is the index of the last entry carried by one message.
class InFlights {
 public:
  // Creates an empty window able to hold at most `max_inflight_msgs` values.
  explicit InFlights(uint64_t max_inflight_msgs)
      : start(0), count(0), size(static_cast<uint32_t>(max_inflight_msgs)) {}

  // Empties the window; the backing buffer is left as it is.
  void reset() {
    start = count = 0;
  }

  // True once the window holds `size` values.
  bool is_full() const {
    return count == size;
  }

  // Appends `inflight` to the window.
  void add(uint64_t inflight);

  // Frees the inflights smaller than or equal to `to`.
  void free_to(uint64_t to);

  // Frees the oldest inflight.
  void free_first_one();

  // position of the oldest inflight in `buffer`
  uint32_t start;
  // number of inflights currently stored
  uint32_t count;

  // maximum number of inflights the window may hold
  uint32_t size;

  // ring buffer; each element is the index of the last entry
  // inside one message
  std::vector<uint64_t> buffer;
};
101 | bool need_snapshot_abort() const; 102 | 103 | void snapshot_failure() { 104 | pending_snapshot = 0; 105 | } 106 | 107 | uint64_t match; 108 | uint64_t next; 109 | // state defines how the leader should interact with the follower. 110 | // 111 | // When in ProgressStateProbe, leader sends at most one replication message 112 | // per heartbeat interval. It also probes actual progress of the follower. 113 | // 114 | // When in ProgressStateReplicate, leader optimistically increases next 115 | // to the latest entry sent after sending replication message. This is 116 | // an optimized state for fast replicating log entries to the follower. 117 | // 118 | // When in ProgressStateSnapshot, leader should have sent out snapshot 119 | // before and stops sending any replication message. 120 | ProgressState state; 121 | 122 | // paused is used in ProgressStateProbe. 123 | // When Paused is true, raft should pause sending replication message to this peer. 124 | bool paused; 125 | // pending_snapshot is used in ProgressStateSnapshot. 126 | // If there is a pending snapshot, the pendingSnapshot will be set to the 127 | // index of the snapshot. If pendingSnapshot is set, the replication process of 128 | // this Progress will be paused. raft will not resend snapshot until the pending one 129 | // is reported to be failed. 130 | uint64_t pending_snapshot; 131 | 132 | // recent_active is true if the progress is recently active. Receiving any messages 133 | // from the corresponding follower indicates the progress is active. 134 | // recent_active can be reset to false after an election timeout. 135 | bool recent_active; 136 | 137 | 138 | // inflights is a sliding window for the inflight messages. 139 | // Each inflight message contains one or more log entries. 140 | // The max number of entries per message is defined in raft config as MaxSizePerMsg. 
141 | // Thus inflight effectively limits both the number of inflight messages 142 | // and the bandwidth each Progress can use. 143 | // When inflights is full, no more message should be sent. 144 | // When a leader sends out a message, the index of the last 145 | // entry should be added to inflights. The index MUST be added 146 | // into inflights in order. 147 | // When a leader receives a reply, the previous inflights should 148 | // be freed by calling inflights.freeTo with the index of the last 149 | // received entry. 150 | 151 | std::shared_ptr inflights; 152 | 153 | // is_learner is true if this progress is tracked for a learner. 154 | bool is_learner; 155 | 156 | }; 157 | typedef std::shared_ptr ProgressPtr; 158 | 159 | } -------------------------------------------------------------------------------- /raft-kv/raft/raft_log.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | class RaftLog { 8 | public: 9 | explicit RaftLog(StoragePtr storage, uint64_t max_next_ents_size); 10 | 11 | ~RaftLog(); 12 | 13 | static uint64_t unlimited() { 14 | return std::numeric_limits::max(); 15 | } 16 | 17 | std::string status_string() const { 18 | char buffer[64]; 19 | int n = snprintf(buffer, 20 | sizeof(buffer), 21 | "committed=%lu, applied=%lu, unstable.offset=%lu, unstable.entries=%lu", 22 | committed_, 23 | applied_, 24 | unstable_->offset_, 25 | unstable_->entries_.size()); 26 | return std::string(buffer, n); 27 | } 28 | 29 | // maybe_append returns (0, false) if the entries cannot be appended. O therwise, 30 | // it returns (last index of new entries, true). 31 | void maybe_append(uint64_t index, 32 | uint64_t log_term, 33 | uint64_t committed, 34 | std::vector entries, 35 | uint64_t& last_new_index, 36 | bool& ok); 37 | 38 | // return last index 39 | uint64_t append(std::vector entries); 40 | 41 | // find_conflict finds the index of the conflict. 
42 | // It returns the first pair of conflicting entries between the existing 43 | // entries and the given entries, if there are any. 44 | // If there is no conflicting entries, and the existing entries contains 45 | // all the given entries, zero will be returned. 46 | // If there is no conflicting entries, but the given entries contains new 47 | // entries, the index of the first new entry will be returned. 48 | // An entry is considered to be conflicting if it has the same index but 49 | // a different term. 50 | // The first entry MUST have an index equal to the argument 'from'. 51 | // The index of the given entries MUST be continuously increasing. 52 | uint64_t find_conflict(const std::vector& entries); 53 | 54 | // next_entries returns all the available entries for execution. 55 | // If applied is smaller than the index of snapshot, it returns all committed 56 | // entries after the index of snapshot. 57 | void next_entries(std::vector& entries) const; 58 | 59 | // has_next_entries returns if there is any available entries for execution. This 60 | // is a fast check without heavy slice in next_entries. 61 | bool has_next_entries() const; 62 | 63 | // slice returns a slice of log entries from low through high-1, inclusive. 64 | Status slice(uint64_t low, uint64_t high, uint64_t max_size, std::vector& entries) const; 65 | 66 | // is_up_to_date determines if the given (lastIndex,term) log is more up-to-date 67 | // by comparing the index and term of the last entries in the existing logs. 68 | // If the logs have last entries with different terms, then the log with the 69 | // later term is more up-to-date. If the logs end with the same term, then 70 | // whichever log has the larger lastIndex is more up-to-date. If the logs are 71 | // the same, the given log is up-to-date. 
72 | bool is_up_to_date(uint64_t lasti, uint64_t term) const { 73 | uint64_t lt = last_term(); 74 | return term > lt || (term == lt && lasti >= last_index()); 75 | } 76 | 77 | std::vector& unstable_entries() { 78 | return unstable_->entries_; 79 | } 80 | 81 | bool maybe_commit(uint64_t max_index, uint64_t term); 82 | 83 | void restore(proto::SnapshotPtr snapshot); 84 | 85 | Status snapshot(proto::SnapshotPtr& snap) const; 86 | 87 | void applied_to(uint64_t index); 88 | 89 | void stable_to(uint64_t index, uint64_t term) { 90 | unstable_->stable_to(index, term); 91 | } 92 | 93 | void stable_snap_to(uint64_t index) { 94 | unstable_->stable_snap_to(index); 95 | } 96 | 97 | Status entries(uint64_t index, uint64_t max_size, std::vector& entries) const { 98 | if (index > last_index()) { 99 | return Status::ok(); 100 | } 101 | return slice(index, last_index() + 1, max_size, entries); 102 | } 103 | 104 | void commit_to(uint64_t to_commit); 105 | 106 | bool match_term(uint64_t index, uint64_t t); 107 | 108 | uint64_t last_term() const; 109 | 110 | Status term(uint64_t index, uint64_t& t) const; 111 | 112 | uint64_t first_index() const; 113 | 114 | uint64_t last_index() const; 115 | 116 | Status must_check_out_of_bounds(uint64_t low, uint64_t high) const; 117 | 118 | void all_entries(std::vector& entries); 119 | 120 | public: 121 | // storage contains all stable entries since the last snapshot. 122 | StoragePtr storage_; 123 | 124 | // unstable contains all unstable entries and snapshot. 125 | // they will be saved into storage. 126 | UnstablePtr unstable_; 127 | 128 | // committed is the highest log position that is known to be in 129 | // stable storage on a quorum of nodes. 130 | uint64_t committed_; 131 | // applied is the highest log position that the application has 132 | // been instructed to apply to its state machine. 
133 | // Invariant: applied <= committed 134 | uint64_t applied_; 135 | 136 | // max_next_ents_size is the maximum number aggregate byte size of the messages 137 | // returned from calls to nextEnts. 138 | uint64_t max_next_ents_size_; 139 | }; 140 | typedef std::shared_ptr RaftLogPtr; 141 | 142 | } 143 | -------------------------------------------------------------------------------- /raft-kv/transport/peer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace kv { 9 | 10 | class PeerImpl; 11 | class ClientSession { 12 | public: 13 | explicit ClientSession(boost::asio::io_service& io_service, PeerImpl* peer); 14 | 15 | ~ClientSession() { 16 | 17 | } 18 | 19 | void send(uint8_t transport_type, const uint8_t* data, uint32_t len) { 20 | uint32_t remaining = buffer_.readable_bytes(); 21 | 22 | TransportMeta meta; 23 | meta.type = transport_type; 24 | meta.len = htonl(len); 25 | assert(sizeof(TransportMeta) == 5); 26 | buffer_.put((const uint8_t*) &meta, sizeof(TransportMeta)); 27 | buffer_.put(data, len); 28 | assert(remaining + sizeof(TransportMeta) + len == buffer_.readable_bytes()); 29 | 30 | if (connected_ && remaining == 0) { 31 | start_write(); 32 | } 33 | } 34 | 35 | void close_session(); 36 | 37 | void start_connect() { 38 | socket_.async_connect(endpoint_, [this](const boost::system::error_code& err) { 39 | if (err) { 40 | LOG_DEBUG("connect [%lu] error %s", this->peer_id_, err.message().c_str()); 41 | this->close_session(); 42 | return; 43 | } 44 | this->connected_ = true; 45 | LOG_INFO("connected to [%lu]", this->peer_id_); 46 | 47 | if (this->buffer_.readable()) { 48 | this->start_write(); 49 | } 50 | }); 51 | } 52 | 53 | void start_write() { 54 | if (!buffer_.readable()) { 55 | return; 56 | } 57 | 58 | uint32_t remaining = buffer_.readable_bytes(); 59 | auto buffer = boost::asio::buffer(buffer_.reader(), remaining); 60 | auto 
handler = [this](const boost::system::error_code& error, std::size_t bytes) { 61 | if (error || bytes == 0) { 62 | LOG_DEBUG("send [%lu] error %s", this->peer_id_, error.message().c_str()); 63 | this->close_session(); 64 | return; 65 | } 66 | this->buffer_.read_bytes(bytes); 67 | this->start_write(); 68 | }; 69 | boost::asio::async_write(socket_, buffer, handler); 70 | } 71 | 72 | private: 73 | boost::asio::ip::tcp::socket socket_; 74 | boost::asio::ip::tcp::endpoint endpoint_; 75 | PeerImpl* peer_; 76 | uint64_t peer_id_; 77 | ByteBuffer buffer_; 78 | bool connected_; 79 | }; 80 | 81 | class PeerImpl : public Peer { 82 | public: 83 | explicit PeerImpl(boost::asio::io_service& io_service, uint64_t peer, const std::string& peer_str) 84 | : peer_(peer), 85 | io_service_(io_service), 86 | timer_(io_service) { 87 | std::vector strs; 88 | boost::split(strs, peer_str, boost::is_any_of(":")); 89 | if (strs.size() != 2) { 90 | LOG_DEBUG("invalid host %s", peer_str.c_str()); 91 | exit(0); 92 | } 93 | auto address = boost::asio::ip::address::from_string(strs[0]); 94 | int port = std::atoi(strs[1].c_str()); 95 | endpoint_ = boost::asio::ip::tcp::endpoint(address, port); 96 | } 97 | 98 | ~PeerImpl() final { 99 | } 100 | 101 | void start() final { 102 | start_timer(); 103 | }; 104 | 105 | void send(proto::MessagePtr msg) final { 106 | msgpack::sbuffer sbuf; 107 | msgpack::pack(sbuf, *msg); 108 | 109 | do_send_data(TransportTypeStream, (const uint8_t*) sbuf.data(), (uint32_t) sbuf.size()); 110 | } 111 | 112 | void send_snap(proto::SnapshotPtr snap) final { 113 | LOG_DEBUG("no impl yet"); 114 | } 115 | 116 | void update(const std::string& peer) final { 117 | LOG_DEBUG("no impl yet"); 118 | } 119 | 120 | uint64_t active_since() final { 121 | LOG_DEBUG("no impl yet"); 122 | return 0; 123 | } 124 | 125 | void stop() final { 126 | 127 | } 128 | 129 | private: 130 | void do_send_data(uint8_t type, const uint8_t* data, uint32_t len) { 131 | if (!session_) { 132 | session_ = 
// Arms the 3-second timer and sends one debug frame to the peer.
// When the timer fires without error the handler calls start_timer again,
// so a debug frame goes out on every period; on error the chain stops.
void start_timer() {
  timer_.expires_from_now(boost::posix_time::seconds(3));
  timer_.async_wait([this](const boost::system::error_code& err) {
    if (err) {
      LOG_ERROR("timer waiter error %s", err.message().c_str());
      return;
    }
    this->start_timer();
  });

  // `tick` is function-static, so it is shared by every PeerImpl instance.
  // The receiver (ServerSession::decode_message) asserts a + 1 == b.
  // NOTE(review): the two increments are separate operations; if start_timer
  // can run on more than one thread, a and b may not be consecutive — confirm
  // the io_service is driven by a single thread.
  static std::atomic tick;
  DebugMessage dbg;
  dbg.a = tick++;
  dbg.b = tick++;
  do_send_data(TransportTypeDebug, (const uint8_t*) &dbg, sizeof(dbg));
}
: socket(io_service), 15 | server_(server) { 16 | 17 | } 18 | 19 | void start_read_meta() { 20 | assert(sizeof(meta_) == 5); 21 | meta_.type = 0; 22 | meta_.len = 0; 23 | auto self = shared_from_this(); 24 | auto buffer = boost::asio::buffer(&meta_, sizeof(meta_)); 25 | auto handler = [self](const boost::system::error_code& error, std::size_t bytes) { 26 | if (bytes == 0) { 27 | return;; 28 | } 29 | if (error) { 30 | LOG_DEBUG("read error %s", error.message().c_str()); 31 | return; 32 | } 33 | 34 | if (bytes != sizeof(meta_)) { 35 | LOG_DEBUG("invalid data len %lu", bytes); 36 | return; 37 | } 38 | self->start_read_message(); 39 | }; 40 | 41 | boost::asio::async_read(socket, buffer, boost::asio::transfer_exactly(sizeof(meta_)), handler); 42 | } 43 | 44 | void start_read_message() { 45 | uint32_t len = ntohl(meta_.len); 46 | if (buffer_.capacity() < len) { 47 | buffer_.resize(len); 48 | } 49 | 50 | auto self = shared_from_this(); 51 | auto buffer = boost::asio::buffer(buffer_.data(), len); 52 | auto handler = [self, len](const boost::system::error_code& error, std::size_t bytes) { 53 | assert(len == ntohl(self->meta_.len)); 54 | if (error || bytes == 0) { 55 | LOG_DEBUG("read error %s", error.message().c_str()); 56 | return; 57 | } 58 | 59 | if (bytes != len) { 60 | LOG_DEBUG("invalid data len %lu, %u", bytes, len); 61 | return; 62 | } 63 | self->decode_message(len); 64 | }; 65 | boost::asio::async_read(socket, buffer, boost::asio::transfer_exactly(len), handler); 66 | } 67 | 68 | void decode_message(uint32_t len) { 69 | switch (meta_.type) { 70 | case TransportTypeDebug: { 71 | assert(len == sizeof(DebugMessage)); 72 | DebugMessage* dbg = (DebugMessage*) buffer_.data(); 73 | assert(dbg->a + 1 == dbg->b); 74 | //LOG_DEBUG("tick ok"); 75 | break; 76 | } 77 | case TransportTypeStream: { 78 | proto::MessagePtr msg(new proto::Message()); 79 | try { 80 | msgpack::object_handle oh = msgpack::unpack((const char*) buffer_.data(), len); 81 | oh.get().convert(*msg); 82 | } 83 
| catch (std::exception& e) { 84 | LOG_ERROR("bad message %s, size = %lu, type %s", 85 | e.what(), 86 | buffer_.size(), 87 | proto::msg_type_to_string(msg->type)); 88 | return; 89 | } 90 | on_receive_stream_message(std::move(msg)); 91 | break; 92 | } 93 | default: { 94 | LOG_DEBUG("unknown msg type %d, len = %d", meta_.type, ntohl(meta_.len)); 95 | return; 96 | } 97 | } 98 | 99 | start_read_meta(); 100 | } 101 | 102 | void on_receive_stream_message(proto::MessagePtr msg); 103 | 104 | boost::asio::ip::tcp::socket socket; 105 | private: 106 | AsioServer* server_; 107 | TransportMeta meta_; 108 | std::vector buffer_; 109 | }; 110 | typedef std::shared_ptr ServerSessionPtr; 111 | 112 | class AsioServer : public IoServer { 113 | public: 114 | explicit AsioServer(boost::asio::io_service& io_service, 115 | const std::string& host, 116 | RaftServer* raft) 117 | : io_service_(io_service), 118 | acceptor_(io_service), 119 | raft_(raft) { 120 | std::vector strs; 121 | boost::split(strs, host, boost::is_any_of(":")); 122 | if (strs.size() != 2) { 123 | LOG_DEBUG("invalid host %s", host.c_str()); 124 | exit(0); 125 | } 126 | auto address = boost::asio::ip::address::from_string(strs[0]); 127 | int port = std::atoi(strs[1].c_str()); 128 | auto endpoint = boost::asio::ip::tcp::endpoint(address, port); 129 | 130 | acceptor_.open(endpoint.protocol()); 131 | acceptor_.set_option(boost::asio::ip::tcp::acceptor::reuse_address(1)); 132 | acceptor_.bind(endpoint); 133 | acceptor_.listen(); 134 | LOG_DEBUG("listen at %s:%d", address.to_string().c_str(), port); 135 | } 136 | 137 | ~AsioServer() { 138 | 139 | } 140 | 141 | void start() final { 142 | ServerSessionPtr session(new ServerSession(io_service_, this)); 143 | acceptor_.async_accept(session->socket, [this, session](const boost::system::error_code& error) { 144 | if (error) { 145 | LOG_DEBUG("accept error %s", error.message().c_str()); 146 | return; 147 | } 148 | 149 | this->start(); 150 | session->start_read_meta(); 151 | }); 152 | 
} 153 | 154 | void stop() final { 155 | 156 | } 157 | 158 | void on_message(proto::MessagePtr msg) { 159 | raft_->process(std::move(msg), [](const Status& status) { 160 | if (!status.is_ok()) { 161 | LOG_ERROR("process error %s", status.to_string().c_str()); 162 | } 163 | }); 164 | } 165 | 166 | private: 167 | boost::asio::io_service& io_service_; 168 | boost::asio::ip::tcp::acceptor acceptor_; 169 | RaftServer* raft_; 170 | }; 171 | 172 | void ServerSession::on_receive_stream_message(proto::MessagePtr msg) { 173 | server_->on_message(std::move(msg)); 174 | } 175 | 176 | std::shared_ptr IoServer::create(void* io_service, 177 | const std::string& host, 178 | RaftServer* raft) { 179 | std::shared_ptr server(new AsioServer(*(boost::asio::io_service*) io_service, host, raft)); 180 | return server; 181 | } 182 | 183 | } 184 | -------------------------------------------------------------------------------- /raft-kv/raft/proto.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace kv { 7 | 8 | namespace proto { 9 | 10 | typedef uint8_t MessageType; 11 | 12 | const MessageType MsgHup = 0; 13 | const MessageType MsgBeat = 1; 14 | const MessageType MsgProp = 2; 15 | const MessageType MsgApp = 3; 16 | const MessageType MsgAppResp = 4; 17 | const MessageType MsgVote = 5; 18 | const MessageType MsgVoteResp = 6; 19 | const MessageType MsgSnap = 7; 20 | const MessageType MsgHeartbeat = 8; 21 | const MessageType MsgHeartbeatResp = 9; 22 | const MessageType MsgUnreachable = 10; 23 | const MessageType MsgSnapStatus = 11; 24 | const MessageType MsgCheckQuorum = 12; 25 | const MessageType MsgTransferLeader = 13; 26 | const MessageType MsgTimeoutNow = 14; 27 | const MessageType MsgReadIndex = 15; 28 | const MessageType MsgReadIndexResp = 16; 29 | const MessageType MsgPreVote = 17; 30 | const MessageType MsgPreVoteResp = 18; 31 | 32 | const MessageType MsgTypeSize = 19; 33 | 34 | const 
char* msg_type_to_string(MessageType type); 35 | 36 | typedef uint8_t EntryType; 37 | 38 | const EntryType EntryNormal = 0; 39 | const EntryType EntryConfChange = 1; 40 | 41 | const char* entry_type_to_string(EntryType type); 42 | 43 | struct Entry { 44 | Entry() 45 | : type(EntryNormal), 46 | term(0), 47 | index(0) {} 48 | 49 | explicit Entry(Entry&& entry) 50 | : type(entry.type), 51 | term(entry.term), 52 | index(entry.index), 53 | data(std::move(entry.data)) { 54 | 55 | } 56 | 57 | kv::proto::Entry& operator=(const kv::proto::Entry& entry) = default; 58 | Entry(const Entry& entry) = default; 59 | 60 | explicit Entry(EntryType type, uint64_t term, uint64_t index, std::vector data) 61 | : type(type), 62 | term(term), 63 | index(index), 64 | data(std::move(data)) {} 65 | 66 | uint32_t serialize_size() const; 67 | 68 | uint32_t payload_size() const { 69 | return static_cast(data.size()); 70 | } 71 | 72 | bool operator==(const Entry& entry) const { 73 | return type == entry.type && term == entry.term && index == entry.index && data == entry.data; 74 | } 75 | bool operator!=(const Entry& entry) const { 76 | return !(*this == entry); 77 | } 78 | 79 | EntryType type; 80 | uint64_t term; 81 | uint64_t index; 82 | std::vector data; 83 | MSGPACK_DEFINE (type, term, index, data); 84 | }; 85 | typedef std::shared_ptr EntryPtr; 86 | 87 | struct ConfState { 88 | bool operator==(const ConfState& cs) const { 89 | return nodes == cs.nodes && learners == cs.learners; 90 | } 91 | 92 | std::vector nodes; 93 | std::vector learners; 94 | MSGPACK_DEFINE (nodes, learners); 95 | }; 96 | typedef std::shared_ptr ConfStatePtr; 97 | 98 | struct SnapshotMetadata { 99 | SnapshotMetadata() 100 | : index(0), 101 | term(0) { 102 | } 103 | 104 | bool operator==(const SnapshotMetadata& meta) const { 105 | return conf_state == meta.conf_state && index == meta.index && term == meta.term; 106 | } 107 | 108 | ConfState conf_state; 109 | uint64_t index; 110 | uint64_t term; 111 | MSGPACK_DEFINE 
(conf_state, index, term); 112 | }; 113 | 114 | struct Snapshot { 115 | Snapshot() = default; 116 | 117 | explicit Snapshot(const std::vector& data) 118 | : data(data) { 119 | } 120 | 121 | bool equal(const Snapshot& snap) const; 122 | 123 | bool is_empty() const { 124 | return metadata.index == 0; 125 | } 126 | std::vector data; 127 | SnapshotMetadata metadata; 128 | MSGPACK_DEFINE (data, metadata); 129 | }; 130 | typedef std::shared_ptr SnapshotPtr; 131 | 132 | struct Message { 133 | Message() 134 | : type(MsgHup), 135 | to(0), 136 | from(0), 137 | term(0), 138 | log_term(0), 139 | index(0), 140 | commit(0), 141 | reject(false), 142 | reject_hint(0) { 143 | 144 | } 145 | 146 | bool operator==(const Message& msg) const { 147 | return type == msg.type && to == msg.to && from == msg.from && term == msg.term 148 | && log_term == msg.log_term && index == msg.index 149 | && entries == msg.entries && commit == msg.commit 150 | && snapshot.equal(msg.snapshot) && reject == msg.reject 151 | && reject_hint == msg.reject_hint && context == msg.context; 152 | } 153 | 154 | bool is_local_msg() const; 155 | 156 | bool is_response_msg() const; 157 | 158 | MessageType type; 159 | uint64_t to; 160 | uint64_t from; 161 | uint64_t term; 162 | uint64_t log_term; 163 | uint64_t index; 164 | std::vector entries; 165 | uint64_t commit; 166 | Snapshot snapshot; 167 | bool reject; 168 | uint64_t reject_hint; 169 | std::vector context; 170 | MSGPACK_DEFINE (type, to, from, term, log_term, index, entries, commit, snapshot, reject, reject_hint, context); 171 | }; 172 | typedef std::shared_ptr MessagePtr; 173 | 174 | struct HardState { 175 | HardState() 176 | : term(0), 177 | vote(0), 178 | commit(0) { 179 | } 180 | 181 | bool is_empty_state() const { 182 | return term == 0 && vote == 0 && commit == 0; 183 | } 184 | 185 | bool equal(const HardState& hs) const { 186 | return term == hs.term && vote == hs.vote && commit == hs.commit; 187 | } 188 | 189 | uint64_t term; 190 | uint64_t vote; 191 | 
// ConfChange carries one configuration-change request.
// NOTE(review): no constructor is declared and the scalar members have no
// initializers, so a default-initialized ConfChange (`ConfChange cc;`) has
// indeterminate id / conf_change_type / node_id until they are assigned.
struct ConfChange {
  // Populates `cc` from `data`; presumably the inverse of serialize() —
  // confirm against the definition in proto.cpp.
  static void from_data(const std::vector& data, ConfChange& cc);
  uint64_t id;
  // Presumably one of the ConfChange* constants declared just above
  // (ConfChangeAddNode ... ConfChangeAddLearnerNode) — confirm.
  uint8_t conf_change_type;
  uint64_t node_id;
  std::vector context;
  // msgpack field order: id, conf_change_type, node_id, context.
  MSGPACK_DEFINE (id, conf_change_type, node_id, context);
  std::vector serialize() const;
};
ID cannot be 0. 38 | uint64_t id; 39 | 40 | // peers contains the IDs of all nodes (including self) in the raft cluster. It 41 | // should only be set when starting a new raft cluster. Restarting raft from 42 | // previous configuration will panic if peers is set. peer is private and only 43 | // used for testing right now. 44 | std::vector peers; 45 | 46 | // learners contains the IDs of all learner nodes (including self if the 47 | // local node is a learner) in the raft cluster. learners only receives 48 | // entries from the leader node. It does not vote or promote itself. 49 | std::vector learners; 50 | 51 | // election_tick is the number of Node.tick invocations that must pass between 52 | // elections. That is, if a follower does not receive any message from the 53 | // leader of current term before election_tick has elapsed, it will become 54 | // candidate and start an election. election_tick must be greater than 55 | // heartbeat_tick. We suggest election_tick = 10 * heartbeat_tick to avoid 56 | // unnecessary leader switching. 57 | uint32_t election_tick; 58 | // heartbeat_tick is the number of Node.tick invocations that must pass between 59 | // heartbeats. That is, a leader sends heartbeat messages to maintain its 60 | // leadership every heartbeat_tick ticks. 61 | uint32_t heartbeat_tick; 62 | 63 | // storage is the storage for raft. raft generates entries and states to be 64 | // stored in storage. raft reads the persisted entries and states out of 65 | // Storage when it needs. raft reads out the previous state and configuration 66 | // out of storage when restarting. 67 | StoragePtr storage; 68 | // applied is the last applied index. It should only be set when restarting 69 | // raft. raft will not return entries to the application smaller or equal to 70 | // Applied. If Applied is unset when restarting, raft might return previous 71 | // applied entries. This is a very application dependent configuration. 
72 | uint64_t applied; 73 | 74 | // max_size_per_msg limits the max byte size of each append message. Smaller 75 | // value lowers the raft recover cost(initial probing and message lost 76 | // during normal operation). On the other side, it might affect the 77 | // throughput during normal replication. Note: math.MaxUint64 for unlimited, 78 | // 0 for at most one entry per message. 79 | uint64_t max_size_per_msg; 80 | // max_committed_size_per_ready limits the size of the committed entries which 81 | // can be applied. 82 | uint64_t max_committed_size_per_ready; 83 | // max_uncommitted_entries_size limits the aggregate byte size of the 84 | // uncommitted entries that may be appended to a leader's log. Once this 85 | // limit is exceeded, proposals will begin to return ErrProposalDropped 86 | // errors. Note: 0 for no limit. 87 | uint64_t max_uncommitted_entries_size; 88 | // max_inflight_msgs limits the max number of in-flight append messages during 89 | // optimistic replication phase. The application transportation layer usually 90 | // has its own sending buffer over TCP/UDP. Setting MaxInflightMsgs to avoid 91 | // overflowing that sending buffer. 92 | uint64_t max_inflight_msgs; 93 | 94 | // check_quorum specifies if the leader should check quorum activity. Leader 95 | // steps down when quorum is not active for an election_timeout. 96 | bool check_quorum; 97 | 98 | // pre_vote enables the Pre-Vote algorithm described in raft thesis section 99 | // 9.6. This prevents disruption when a node that has been partitioned away 100 | // rejoins the cluster. 101 | bool pre_vote; 102 | 103 | // read_only_option specifies how the read only request is processed. 104 | // 105 | // ReadOnlySafe guarantees the linearizability of the read only request by 106 | // communicating with the quorum. It is the default and suggested option. 107 | // 108 | // ReadOnlyLeaseBased ensures linearizability of the read only request by 109 | // relying on the leader lease. 
It can be affected by clock drift. 110 | // If the clock drift is unbounded, leader might keep the lease longer than it 111 | // should (clock can move backward/pause without any bound). read_index is not safe 112 | // in that case. 113 | // CheckQuorum MUST be enabled if ReadOnlyOption is ReadOnlyLeaseBased. 114 | ReadOnlyOption read_only_option; 115 | 116 | // disable_proposal_forwarding set to true means that followers will drop 117 | // proposals, rather than forwarding them to the leader. One use case for 118 | // this feature would be in a situation where the Raft leader is used to 119 | // compute the data of a proposal, for example, adding a timestamp from a 120 | // hybrid logical clock to data in a monotonically increasing way. Forwarding 121 | // should be disabled to prevent a follower with an inaccurate hybrid 122 | // logical clock from assigning the timestamp and then forwarding the data 123 | // to the leader. 124 | bool disable_proposal_forwarding; 125 | 126 | Status validate(); 127 | }; 128 | 129 | } -------------------------------------------------------------------------------- /raft-kv/raft/node.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | typedef uint8_t SnapshotStatus; 10 | 11 | static const SnapshotStatus SnapshotFinish = 1; 12 | static const SnapshotStatus SnapshotFailure = 2; 13 | 14 | struct PeerContext { 15 | uint64_t id; 16 | std::vector context; 17 | }; 18 | 19 | class Node { 20 | public: 21 | ~Node() = default; 22 | 23 | // tick increments the internal logical clock for the Node by a single tick. Election 24 | // timeouts and heartbeat timeouts are in units of ticks. 25 | virtual void tick() = 0; 26 | 27 | // campaign causes the Node to transition to candidate state and start campaigning to become leader. 
28 | virtual Status campaign() = 0; 29 | 30 | // propose proposes that data be appended to the log. Note that proposals can be lost without 31 | // notice, therefore it is user's job to ensure proposal retries. 32 | virtual Status propose(std::vector data) = 0; 33 | 34 | // propose_conf_change proposes config change. 35 | // At most one ConfChange can be in the process of going through consensus. 36 | // Application needs to call apply_conf_change when applying EntryConfChange type entry. 37 | virtual Status propose_conf_change(const proto::ConfChange& cc) = 0; 38 | 39 | // step advances the state machine using the given message. ctx.Err() will be returned, if any. 40 | virtual Status step(proto::MessagePtr msg) = 0; 41 | 42 | // ready returns the current point-in-time state of this RawNode. 43 | virtual ReadyPtr ready() = 0; 44 | 45 | // has_ready called when RawNode user need to check if any Ready pending. 46 | // Checking logic in this method should be consistent with Ready.containsUpdates(). 47 | virtual bool has_ready() = 0; 48 | 49 | // advance notifies the Node that the application has saved progress up to the last ready. 50 | // It prepares the node to return the next available ready. 51 | // 52 | // The application should generally call advance after it applies the entries in last ready. 53 | // 54 | // However, as an optimization, the application may call advance while it is applying the 55 | // commands. For example. when the last ready contains a snapshot, the application might take 56 | // a long time to apply the snapshot data. To continue receiving ready without blocking raft 57 | // progress, it can call advance before finishing applying the last ready. 58 | virtual void advance(ReadyPtr ready) = 0; 59 | 60 | // apply_conf_change applies config change to the local node. 61 | // Returns an opaque ConfState protobuf which must be recorded 62 | // in snapshots. Will never return nil; it returns a pointer only 63 | // to match MemoryStorage.Compact. 
64 | virtual proto::ConfStatePtr apply_conf_change(const proto::ConfChange& cc) = 0; 65 | 66 | // transfer_leadership attempts to transfer leadership to the given transferee. 67 | virtual void transfer_leadership(uint64_t lead, ino64_t transferee) = 0; 68 | 69 | // read_index request a read state. The read state will be set in the ready. 70 | // Read state has a read index. Once the application advances further than the read 71 | // index, any linearizable read requests issued before the read request can be 72 | // processed safely. The read state will have the same rctx attached. 73 | virtual Status read_index(std::vector rctx) = 0; 74 | 75 | // raft_status returns the current status of the raft state machine. 76 | virtual RaftStatusPtr raft_status() = 0; 77 | 78 | // report_unreachable reports the given node is not reachable for the last send. 79 | virtual void report_unreachable(uint64_t id) = 0; 80 | 81 | // report_snapshot reports the status of the sent snapshot. The id is the raft ID of the follower 82 | // who is meant to receive the snapshot, and the status is SnapshotFinish or SnapshotFailure. 83 | // Calling report_snapshot with SnapshotFinish is a no-op. But, any failure in applying a 84 | // snapshot (for e.g., while streaming it from leader to follower), should be reported to the 85 | // leader with SnapshotFailure. When leader sends a snapshot to a follower, it pauses any raft 86 | // log probes until the follower can apply the snapshot and advance its state. If the follower 87 | // can't do that, for e.g., due to a crash, it could end up in a limbo, never getting any 88 | // updates from the leader. Therefore, it is crucial that the application ensures that any 89 | // failure in snapshot sending is caught and reported back to the leader; so it can resume raft 90 | // log probing in the follower. 91 | virtual void report_snapshot(uint64_t id, SnapshotStatus status) = 0; 92 | 93 | // stop performs any necessary termination of the Node. 
94 | virtual void stop() = 0; 95 | 96 | // start_node returns a new Node given configuration and a list of raft peers. 97 | // It appends a ConfChangeAddNode entry for each given peer to the initial log. 98 | static Node* start_node(const Config& conf, const std::vector& peers); 99 | 100 | // restart_node is similar to start_node but does not take a list of peers. 101 | // The current membership of the cluster will be restored from the Storage. 102 | // If the caller has an existing state machine, pass in the last log index that 103 | // has been applied to it; otherwise use zero. 104 | static Node* restart_node(const Config& conf); 105 | }; 106 | 107 | // RawNode is a thread-unsafe Node. 108 | // The methods of this struct correspond to the methods of Node and are described 109 | // more fully there. 110 | class RawNode : public Node { 111 | public: 112 | explicit RawNode(const Config& conf, const std::vector& peers); 113 | explicit RawNode(const Config& conf); 114 | 115 | ~RawNode() = default; 116 | 117 | void tick() final; 118 | Status campaign() final; 119 | Status propose(std::vector data) final; 120 | Status propose_conf_change(const proto::ConfChange& cc) final; 121 | Status step(proto::MessagePtr msg) final; 122 | ReadyPtr ready() final; 123 | bool has_ready() final; 124 | void advance(ReadyPtr rd) final; 125 | proto::ConfStatePtr apply_conf_change(const proto::ConfChange& cc) final; 126 | void transfer_leadership(uint64_t lead, ino64_t transferee) final; 127 | Status read_index(std::vector rctx) final; 128 | RaftStatusPtr raft_status() final; 129 | void report_unreachable(uint64_t id) final; 130 | void report_snapshot(uint64_t id, SnapshotStatus status) final; 131 | void stop() final; 132 | public: 133 | RaftPtr raft_; 134 | SoftStatePtr prev_soft_state_; 135 | proto::HardState prev_hard_state_; 136 | }; 137 | 138 | } -------------------------------------------------------------------------------- /raft-kv/raft/storage.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | Status MemoryStorage::initial_state(proto::HardState& hard_state, proto::ConfState& conf_state) { 8 | hard_state = hard_state_; 9 | 10 | // copy 11 | conf_state = snapshot_->metadata.conf_state; 12 | return Status::ok(); 13 | } 14 | 15 | void MemoryStorage::set_hard_state(proto::HardState& hard_state) { 16 | std::lock_guard guard(mutex_); 17 | hard_state_ = hard_state; 18 | } 19 | 20 | Status MemoryStorage::entries(uint64_t low, 21 | uint64_t high, 22 | uint64_t max_size, 23 | std::vector& entries) { 24 | assert(low < high); 25 | std::lock_guard guard(mutex_); 26 | 27 | uint64_t offset = entries_[0]->index; 28 | if (low <= offset) { 29 | return Status::invalid_argument("requested index is unavailable due to compaction"); 30 | } 31 | uint64_t last = 0; 32 | this->last_index_impl(last); 33 | 34 | if (high > last + 1) { 35 | LOG_FATAL("entries' hi(%lu) is out of bound last_index(%lu)", high, last); 36 | } 37 | // only contains dummy entries. 
38 | if (entries_.size() == 1) { 39 | return Status::invalid_argument("requested entry at index is unavailable"); 40 | } 41 | 42 | for (uint64_t i = low - offset; i < high - offset; ++i) { 43 | entries.push_back(entries_[i]); 44 | } 45 | entry_limit_size(max_size, entries); 46 | return Status::ok(); 47 | } 48 | 49 | Status MemoryStorage::term(uint64_t i, uint64_t& term) { 50 | std::lock_guard guard(mutex_); 51 | 52 | uint64_t offset = entries_[0]->index; 53 | 54 | if (i < offset) { 55 | return Status::invalid_argument("requested index is unavailable due to compaction"); 56 | } 57 | 58 | if (i - offset >= entries_.size()) { 59 | return Status::invalid_argument("requested entry at index is unavailable"); 60 | } 61 | term = entries_[i - offset]->term; 62 | return Status::ok(); 63 | } 64 | 65 | Status MemoryStorage::last_index(uint64_t& index) { 66 | std::lock_guard guard(mutex_); 67 | return last_index_impl(index); 68 | } 69 | 70 | Status MemoryStorage::first_index(uint64_t& index) { 71 | std::lock_guard guard(mutex_); 72 | return first_index_impl(index); 73 | } 74 | 75 | Status MemoryStorage::snapshot(proto::SnapshotPtr& snapshot) { 76 | std::lock_guard guard(mutex_); 77 | snapshot = snapshot_; 78 | return Status::ok(); 79 | } 80 | 81 | Status MemoryStorage::compact(uint64_t compact_index) { 82 | std::lock_guard guard(mutex_); 83 | 84 | uint64_t offset = entries_[0]->index; 85 | 86 | if (compact_index <= offset) { 87 | return Status::invalid_argument("requested index is unavailable due to compaction"); 88 | } 89 | 90 | uint64_t last_idx; 91 | this->last_index_impl(last_idx); 92 | if (compact_index > last_idx) { 93 | LOG_FATAL("compact %lu is out of bound lastindex(%lu)", compact_index, last_idx); 94 | } 95 | 96 | uint64_t i = compact_index - offset; 97 | entries_[0]->index = entries_[i]->index; 98 | entries_[0]->term = entries_[i]->term; 99 | 100 | entries_.erase(entries_.begin() + 1, entries_.begin() + i + 1); 101 | return Status::ok(); 102 | } 103 | 104 | Status 
MemoryStorage::append(std::vector entries) { 105 | if (entries.empty()) { 106 | return Status::ok(); 107 | } 108 | 109 | std::lock_guard guard(mutex_); 110 | 111 | uint64_t first = 0; 112 | first_index_impl(first); 113 | uint64_t last = entries[0]->index + entries.size() - 1; 114 | 115 | // shortcut if there is no new entry. 116 | if (last < first) { 117 | return Status::ok(); 118 | } 119 | 120 | // truncate compacted entries 121 | if (first > entries[0]->index) { 122 | uint64_t n = first - entries[0]->index; 123 | // entries before first are already covered by the snapshot, drop them 124 | entries.erase(entries.begin(), entries.begin() + n); 125 | } 126 | 127 | uint64_t offset = entries[0]->index - entries_[0]->index; 128 | 129 | if (entries_.size() > offset) { 130 | // keep the stored entries before offset, discard everything from offset onward 131 | entries_.erase(entries_.begin() + offset, entries_.end()); 132 | entries_.insert(entries_.end(), entries.begin(), entries.end()); 133 | } else if (entries_.size() == offset) { 134 | entries_.insert(entries_.end(), entries.begin(), entries.end()); 135 | } else { 136 | uint64_t last_idx; 137 | last_index_impl(last_idx); 138 | LOG_FATAL("missing log entry [last: %lu, append at: %lu]", last_idx, entries[0]->index); 139 | } 140 | return Status::ok(); 141 | } 142 | 143 | Status MemoryStorage::create_snapshot(uint64_t index, 144 | proto::ConfStatePtr cs, 145 | std::vector data, 146 | proto::SnapshotPtr& snapshot) { 147 | std::lock_guard guard(mutex_); 148 | 149 | if (index <= snapshot_->metadata.index) { 150 | snapshot = std::make_shared(); 151 | return Status::invalid_argument("requested index is older than the existing snapshot"); 152 | } 153 | 154 | uint64_t offset = entries_[0]->index; 155 | uint64_t last = 0; 156 | last_index_impl(last); 157 | if (index > last) { 158 | LOG_FATAL("snapshot %lu is out of bound lastindex(%lu)", index, last); 159 | } 160 | 161 | snapshot_->metadata.index = index; 162 | snapshot_->metadata.term = entries_[index - offset]->term; 163 | if (cs) { 164 |
snapshot_->metadata.conf_state = *cs; 165 | } 166 | snapshot_->data = std::move(data); 167 | snapshot = snapshot_; 168 | return Status::ok(); 169 | 170 | } 171 | 172 | Status MemoryStorage::apply_snapshot(const proto::Snapshot& snapshot) { 173 | std::lock_guard guard(mutex_); 174 | 175 | uint64_t index = snapshot_->metadata.index; 176 | uint64_t snap_index = snapshot.metadata.index; 177 | 178 | if (index >= snap_index) { 179 | return Status::invalid_argument("requested index is older than the existing snapshot"); 180 | } 181 | 182 | snapshot_ = std::make_shared(snapshot); 183 | 184 | entries_.resize(1); 185 | proto::EntryPtr entry(new proto::Entry()); 186 | entry->term = snapshot_->metadata.term; 187 | entry->index = snapshot_->metadata.index; 188 | entries_[0] = std::move(entry); 189 | return Status::ok(); 190 | } 191 | 192 | Status MemoryStorage::last_index_impl(uint64_t& index) { 193 | index = entries_[0]->index + entries_.size() - 1; 194 | return Status::ok(); 195 | } 196 | 197 | Status MemoryStorage::first_index_impl(uint64_t& index) { 198 | index = entries_[0]->index + 1; 199 | return Status::ok(); 200 | } 201 | 202 | } -------------------------------------------------------------------------------- /raft-kv/raft/raft.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace kv { 12 | 13 | class Raft { 14 | public: 15 | explicit Raft(const Config& c); 16 | 17 | virtual ~Raft(); 18 | 19 | void tick(); 20 | 21 | void become_follower(uint64_t term, uint64_t lead); 22 | 23 | void become_candidate(); 24 | 25 | void become_pre_candidate(); 26 | 27 | void become_leader(); 28 | 29 | // campaign_type represents the type of campaigning 30 | // the reason we use the type of string instead of uint64 31 | // is because it's simpler to compare and fill in raft entries 32 | void campaign(const std::string& 
campaign_type); 33 | 34 | uint32_t poll(uint64_t id, proto::MessageType type, bool v); 35 | 36 | virtual Status step(proto::MessagePtr msg); 37 | 38 | Status step_leader(proto::MessagePtr msg); 39 | 40 | // step_candidate is shared by StateCandidate and StatePreCandidate; the difference is 41 | // whether they respond to MsgVoteResp or MsgPreVoteResp. 42 | Status step_candidate(proto::MessagePtr msg); 43 | 44 | Status step_follower(proto::MessagePtr msg); 45 | 46 | void handle_append_entries(proto::MessagePtr msg); 47 | 48 | void handle_heartbeat(proto::MessagePtr msg); 49 | 50 | void handle_snapshot(proto::MessagePtr msg); 51 | 52 | bool restore(const proto::Snapshot& snapshot); 53 | 54 | void send(proto::MessagePtr msg); 55 | 56 | void restore_node(const std::vector& nodes, bool is_learner); 57 | 58 | // promotable indicates whether state machine can be promoted to leader, 59 | // which is true when its own id is in progress list. 60 | bool promotable() const; 61 | 62 | void add_node(uint64_t id); 63 | 64 | void add_node_or_learner(uint64_t id, bool is_learner); 65 | 66 | void remove_node(uint64_t id); 67 | 68 | uint32_t quorum() const { 69 | return static_cast(prs_.size() / 2 + 1); 70 | } 71 | 72 | SoftStatePtr soft_state() const; 73 | 74 | proto::HardState hard_state() const; 75 | 76 | void load_state(const proto::HardState& state); 77 | 78 | void nodes(std::vector& node) const; 79 | 80 | void learner_nodes(std::vector& learner) const; 81 | 82 | ProgressPtr get_progress(uint64_t id); 83 | 84 | void set_progress(uint64_t id, uint64_t match, uint64_t next, bool is_learner); 85 | 86 | void del_progress(uint64_t id); 87 | 88 | // sendAppend sends an append RPC with new entries (if any) and the 89 | // current commit index to the given peer. 90 | void send_append(uint64_t to); 91 | 92 | // maybe_send_append sends an append RPC with new entries to the given peer, 93 | // if necessary. Returns true if a message was sent. 
The sendIfEmpty 94 | // argument controls whether messages with no entries will be sent 95 | // ("empty" messages are useful to convey updated Commit indexes, but 96 | // are undesirable when we're sending multiple messages in a batch). 97 | bool maybe_send_append(uint64_t to, bool send_if_empty); 98 | 99 | // send_heartbeat sends a heartbeat RPC to the given peer. 100 | void send_heartbeat(uint64_t to, std::vector ctx); 101 | 102 | void for_each_progress(const std::function& callback); 103 | 104 | // bcast_append sends RPC, with entries to all peers that are not up-to-date 105 | // according to the progress recorded in prs_. 106 | void bcast_append(); 107 | 108 | void bcast_heartbeat(); 109 | 110 | void bcast_heartbeat_with_ctx(const std::vector& ctx); 111 | 112 | // maybe_commit attempts to advance the commit index. Returns true if 113 | // the commit index changed (in which case the caller should call 114 | // bcast_append). 115 | bool maybe_commit(); 116 | 117 | void reset(uint64_t term); 118 | 119 | bool append_entry(const std::vector& entries); 120 | 121 | // tick_election is run by followers and candidates after ElectionTimeout. 122 | void tick_election(); 123 | 124 | void tick_heartbeat(); 125 | 126 | // past_election_timeout returns true if r.electionElapsed is greater 127 | // than or equal to the randomized election timeout in 128 | // [electiontimeout, 2 * electiontimeout - 1]. 129 | bool past_election_timeout(); 130 | 131 | void reset_randomized_election_timeout(); 132 | 133 | bool check_quorum_active(); 134 | 135 | void send_timeout_now(uint64_t to); 136 | 137 | void abort_leader_transfer(); 138 | 139 | // increase_uncommitted_size computes the size of the proposed entries and 140 | // determines whether they would push leader over its maxUncommittedSize limit. 141 | // If the new entries would exceed the limit, the method returns false. If not, 142 | // the increase in uncommitted entry size is recorded and the method returns 143 | // true. 
144 | bool increase_uncommitted_size(const std::vector& entries); 145 | 146 | // reduce_uncommitted_size accounts for the newly committed entries by decreasing 147 | // the uncommitted entry size limit. 148 | void reduce_uncommitted_size(const std::vector& entries); 149 | 150 | virtual std::vector read_messages() { 151 | std::vector ret; 152 | ret.swap(msgs_); 153 | msgs_.clear(); 154 | return ret; 155 | } 156 | 157 | public: 158 | uint64_t id_; 159 | 160 | uint64_t term_; 161 | uint64_t vote_; 162 | 163 | std::vector read_states_; 164 | 165 | // the log 166 | RaftLogPtr raft_log_; 167 | 168 | uint64_t max_msg_size_; 169 | uint64_t max_uncommitted_size_; 170 | uint64_t max_inflight_; 171 | std::unordered_map prs_; 172 | std::unordered_map learner_prs_; 173 | std::vector match_buf_; 174 | 175 | RaftState state_; 176 | 177 | // is_learner_ is true if the local raft node is a learner. 178 | bool is_learner_; 179 | 180 | std::unordered_map votes_; 181 | 182 | std::vector msgs_; 183 | 184 | // the leader id 185 | uint64_t lead_; 186 | 187 | // lead_transferee_ is id of the leader transfer target when its value is not zero. 188 | // Follow the procedure defined in raft thesis 3.10. 189 | uint64_t lead_transferee_; 190 | // Only one conf change may be pending (in the log, but not yet 191 | // applied) at a time. This is enforced via pending_conf_index_, which 192 | // is set to a value >= the log index of the latest pending 193 | // configuration change (if any). Config changes are only allowed to 194 | // be proposed if the leader's applied index is greater than this 195 | // value. 196 | uint64_t pending_conf_index_; 197 | // an estimate of the size of the uncommitted tail of the Raft log. Used to 198 | // prevent unbounded log growth. Only maintained by the leader. Reset on 199 | // term changes. 
200 | uint64_t uncommitted_size_; 201 | 202 | ReadOnlyPtr read_only_; 203 | 204 | // number of ticks since it reached last election_timeout_ when it is leader 205 | // or candidate. 206 | // number of ticks since it reached last electionTimeout or received a 207 | // valid message from current leader when it is a follower. 208 | uint32_t election_elapsed_; 209 | 210 | // number of ticks since it reached last heartbeat_timeout_. 211 | // only leader keeps heartbeatElapsed. 212 | uint32_t heartbeat_elapsed_; 213 | 214 | bool check_quorum_; 215 | bool pre_vote_; 216 | 217 | uint32_t heartbeat_timeout_; 218 | uint32_t election_timeout_; 219 | // randomized_election_timeout_ is a random number between 220 | // [election_timeout_, 2 * election_timeout_ - 1]. It gets reset 221 | // when raft changes its state to follower or candidate. 222 | uint32_t randomized_election_timeout_; 223 | 224 | bool disable_proposal_forwarding_; 225 | 226 | std::function tick_; 227 | std::function step_; 228 | RandomDevice random_device_; 229 | }; 230 | typedef std::shared_ptr RaftPtr; 231 | 232 | } 233 | -------------------------------------------------------------------------------- /raft-kv/raft/node.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace kv { 5 | 6 | Node* Node::start_node(const Config& conf, const std::vector& peers) { 7 | return new RawNode(conf, peers); 8 | } 9 | 10 | Node* Node::restart_node(const Config& conf) { 11 | return new RawNode(conf); 12 | } 13 | 14 | RawNode::RawNode(const Config& conf, const std::vector& peers) { 15 | raft_ = std::make_shared(conf); 16 | 17 | uint64_t last_index = 0; 18 | Status status = conf.storage->last_index(last_index); 19 | if (!status.is_ok()) { 20 | LOG_FATAL("%s", status.to_string().c_str()); 21 | } 22 | 23 | // If the log is empty, this is a new RawNode (like StartNode); otherwise it's 24 | // restoring an existing RawNode (like
RestartNode). 25 | if (last_index == 0) { 26 | raft_->become_follower(1, 0); 27 | 28 | std::vector entries; 29 | 30 | for (size_t i = 0; i < peers.size(); ++i) { 31 | auto& peer = peers[i]; 32 | proto::ConfChange cs = proto::ConfChange{ 33 | .id = 0, 34 | .conf_change_type = proto::ConfChangeAddNode, 35 | .node_id = peer.id, 36 | .context = peer.context, 37 | }; 38 | 39 | std::vector data = cs.serialize(); 40 | 41 | proto::EntryPtr entry(new proto::Entry()); 42 | entry->type = proto::EntryConfChange; 43 | entry->term = 1; 44 | entry->index = i + 1; 45 | entry->data = std::move(data); 46 | entries.push_back(entry); 47 | } 48 | 49 | raft_->raft_log_->append(entries); 50 | raft_->raft_log_->committed_ = entries.size(); 51 | 52 | for (auto& peer : peers) { 53 | raft_->add_node(peer.id); 54 | } 55 | } 56 | 57 | // Set the initial hard and soft states after performing all initialization. 58 | prev_soft_state_ = raft_->soft_state(); 59 | if (last_index == 0) { 60 | prev_hard_state_ = proto::HardState(); 61 | } else { 62 | prev_hard_state_ = raft_->hard_state(); 63 | } 64 | } 65 | 66 | RawNode::RawNode(const Config& conf) { 67 | 68 | uint64_t last_index = 0; 69 | Status status = conf.storage->last_index(last_index); 70 | if (!status.is_ok()) { 71 | LOG_FATAL("%s", status.to_string().c_str()); 72 | } 73 | 74 | raft_ = std::make_shared(conf); 75 | 76 | // Set the initial hard and soft states after performing all initialization. 
77 | prev_soft_state_ = raft_->soft_state(); 78 | if (last_index == 0) { 79 | prev_hard_state_ = proto::HardState(); 80 | } else { 81 | prev_hard_state_ = raft_->hard_state(); 82 | } 83 | } 84 | 85 | 86 | void RawNode::tick() { 87 | raft_->tick(); 88 | } 89 | 90 | Status RawNode::campaign() { 91 | proto::MessagePtr msg(new proto::Message()); 92 | msg->type = proto::MsgHup; 93 | return raft_->step(std::move(msg)); 94 | } 95 | 96 | Status RawNode::propose(std::vector data) { 97 | proto::MessagePtr msg(new proto::Message()); 98 | msg->type = proto::MsgProp; 99 | msg->from = raft_->id_; 100 | msg->entries.emplace_back(proto::EntryNormal, 0, 0, std::move(data)); 101 | 102 | return raft_->step(std::move(msg)); 103 | } 104 | 105 | Status RawNode::propose_conf_change(const proto::ConfChange& cc) { 106 | proto::MessagePtr msg(new proto::Message()); 107 | msg->type = proto::MsgProp; 108 | msg->entries.emplace_back(proto::EntryConfChange, 0, 0, cc.serialize()); 109 | return raft_->step(std::move(msg)); 110 | } 111 | 112 | Status RawNode::step(proto::MessagePtr msg) { 113 | // ignore unexpected local messages receiving over network 114 | if (msg->is_local_msg()) { 115 | return Status::invalid_argument("raft: cannot step raft local message"); 116 | } 117 | 118 | ProgressPtr progress = raft_->get_progress(msg->from); 119 | if (progress || !msg->is_response_msg()) { 120 | return raft_->step(msg); 121 | } 122 | return Status::invalid_argument("raft: cannot step as peer not found"); 123 | } 124 | 125 | ReadyPtr RawNode::ready() { 126 | ReadyPtr rd = std::make_shared(raft_, prev_soft_state_, prev_hard_state_); 127 | raft_->msgs_.clear(); 128 | raft_->reduce_uncommitted_size(rd->committed_entries); 129 | return rd; 130 | } 131 | 132 | bool RawNode::has_ready() { 133 | assert(prev_soft_state_); 134 | if (!raft_->soft_state()->equal(*prev_soft_state_)) { 135 | return true; 136 | } 137 | proto::HardState hs = raft_->hard_state(); 138 | if (!hs.is_empty_state() && 
!hs.equal(prev_hard_state_)) { 139 | return true; 140 | } 141 | 142 | proto::SnapshotPtr snapshot = raft_->raft_log_->unstable_->snapshot_; 143 | 144 | if (snapshot && !snapshot->is_empty()) { 145 | return true; 146 | } 147 | if (!raft_->msgs_.empty() || !raft_->raft_log_->unstable_entries().empty() 148 | || raft_->raft_log_->has_next_entries()) { 149 | return true; 150 | } 151 | 152 | return !raft_->read_states_.empty(); 153 | } 154 | 155 | void RawNode::advance(ReadyPtr rd) { 156 | if (rd->soft_state) { 157 | prev_soft_state_ = rd->soft_state; 158 | 159 | } 160 | if (!rd->hard_state.is_empty_state()) { 161 | prev_hard_state_ = rd->hard_state; 162 | } 163 | 164 | // If entries were applied (or a snapshot), update our cursor for 165 | // the next Ready. Note that if the current HardState contains a 166 | // new Commit index, this does not mean that we're also applying 167 | // all of the new entries due to commit pagination by size. 168 | uint64_t index = rd->applied_cursor(); 169 | if (index > 0) { 170 | raft_->raft_log_->applied_to(index); 171 | } 172 | 173 | if (!rd->entries.empty()) { 174 | auto& entry = rd->entries.back(); 175 | raft_->raft_log_->stable_to(entry->index, entry->term); 176 | } 177 | 178 | if (!rd->snapshot.is_empty()) { 179 | raft_->raft_log_->stable_snap_to(rd->snapshot.metadata.index); 180 | } 181 | 182 | if (!rd->read_states.empty()) { 183 | raft_->read_states_.clear(); 184 | } 185 | } 186 | 187 | proto::ConfStatePtr RawNode::apply_conf_change(const proto::ConfChange& cc) { 188 | proto::ConfStatePtr state(new proto::ConfState()); 189 | if (cc.node_id == 0) { 190 | raft_->nodes(state->nodes); 191 | raft_->learner_nodes(state->learners); 192 | return state; 193 | } 194 | 195 | switch (cc.conf_change_type) { 196 | case proto::ConfChangeAddNode: { 197 | raft_->add_node_or_learner(cc.node_id, false); 198 | break; 199 | } 200 | case proto::ConfChangeAddLearnerNode: { 201 | raft_->add_node_or_learner(cc.node_id, true); 202 | break; 203 | } 204 | 
case proto::ConfChangeRemoveNode: { 205 | raft_->remove_node(cc.node_id); 206 | break; 207 | } 208 | case proto::ConfChangeUpdateNode: { 209 | LOG_DEBUG("ConfChangeUpdate"); 210 | break; 211 | } 212 | default: { 213 | LOG_FATAL("unexpected conf type"); 214 | } 215 | } 216 | raft_->nodes(state->nodes); 217 | raft_->learner_nodes(state->learners); 218 | return state; 219 | } 220 | 221 | void RawNode::transfer_leadership(uint64_t lead, uint64_t transferee) { 222 | // manually set 'from' and 'to', so that leader can voluntarily transfer its leadership 223 | proto::MessagePtr msg(new proto::Message()); 224 | msg->type = proto::MsgTransferLeader; 225 | msg->from = transferee; 226 | msg->to = lead; 227 | 228 | Status status = raft_->step(std::move(msg)); 229 | if (!status.is_ok()) { 230 | LOG_WARN("transfer_leadership %s", status.to_string().c_str()); 231 | } 232 | } 233 | 234 | Status RawNode::read_index(std::vector rctx) { 235 | proto::MessagePtr msg(new proto::Message()); 236 | msg->type = proto::MsgReadIndex; 237 | msg->entries.emplace_back(proto::EntryNormal, 0, 0, std::move(rctx)); 238 | return raft_->step(std::move(msg)); 239 | } 240 | 241 | RaftStatusPtr RawNode::raft_status() { 242 | LOG_DEBUG("no impl yet"); 243 | return nullptr; 244 | } 245 | 246 | void RawNode::report_unreachable(uint64_t id) { 247 | proto::MessagePtr msg(new proto::Message()); 248 | msg->type = proto::MsgUnreachable; 249 | msg->from = id; 250 | 251 | Status status = raft_->step(std::move(msg)); 252 | if (!status.is_ok()) { 253 | LOG_WARN("report_unreachable %s", status.to_string().c_str()); 254 | } 255 | } 256 | 257 | void RawNode::report_snapshot(uint64_t id, SnapshotStatus status) { 258 | bool rej = (status == SnapshotFailure); 259 | proto::MessagePtr msg(new proto::Message()); 260 | msg->type = proto::MsgSnapStatus; 261 | msg->from = id; 262 | msg->reject = rej; 263 | 264 | Status s = raft_->step(std::move(msg)); 265 | if (!s.is_ok()) { 266 | LOG_WARN("report_snapshot %s",
s.to_string().c_str()); 267 | } 268 | } 269 | 270 | void RawNode::stop() { 271 | 272 | } 273 | 274 | } -------------------------------------------------------------------------------- /raft-kv/raft/raft_log.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace kv { 6 | 7 | RaftLog::RaftLog(StoragePtr storage, uint64_t max_next_ents_size) 8 | : storage_(std::move(storage)), 9 | committed_(0), 10 | applied_(0), 11 | max_next_ents_size_(max_next_ents_size) { 12 | assert(storage_); 13 | uint64_t first; 14 | auto status = storage_->first_index(first); 15 | assert(status.is_ok()); 16 | 17 | uint64_t last; 18 | status = storage_->last_index(last); 19 | assert(status.is_ok()); 20 | 21 | unstable_ = std::make_shared(last + 1); 22 | 23 | // Initialize our committed and applied pointers to the time of the last compaction. 24 | applied_ = committed_ = first - 1; 25 | } 26 | RaftLog::~RaftLog() { 27 | 28 | } 29 | 30 | void RaftLog::maybe_append(uint64_t index, 31 | uint64_t log_term, 32 | uint64_t committed, 33 | std::vector entries, 34 | uint64_t& last_new_index, 35 | bool& ok) { 36 | if (match_term(index, log_term)) { 37 | uint64_t lastnewi = index + entries.size(); 38 | uint64_t ci = find_conflict(entries); 39 | if (ci == 0) { 40 | //no conflict 41 | } else if (ci <= committed_) { 42 | LOG_FATAL("entry %lu conflict with committed entry [committed(%lu)]", ci, committed_); 43 | } else { 44 | assert(ci > 0); 45 | uint64_t offset = index + 1; 46 | uint64_t n = ci - offset; 47 | entries.erase(entries.begin(), entries.begin() + n); 48 | append(std::move(entries)); 49 | } 50 | 51 | commit_to(std::min(committed, lastnewi)); 52 | 53 | last_new_index = lastnewi; 54 | ok = true; 55 | return; 56 | } else { 57 | last_new_index = 0; 58 | ok = false; 59 | } 60 | } 61 | 62 | uint64_t RaftLog::append(std::vector entries) { 63 | if (entries.empty()) { 64 | return last_index(); 65 | } 66 | 67 | uint64_t 
after = entries[0]->index - 1; 68 | if (after < committed_) { 69 | LOG_FATAL("after(%lu) is out of range [committed(%lu)]\", after, committed_", after, committed_); 70 | } 71 | 72 | unstable_->truncate_and_append(std::move(entries)); 73 | return last_index(); 74 | } 75 | 76 | uint64_t RaftLog::find_conflict(const std::vector& entries) { 77 | for (const proto::EntryPtr& entry : entries) { 78 | if (!match_term(entry->index, entry->term)) { 79 | if (entry->index < last_index()) { 80 | uint64_t t; 81 | Status status = this->term(entry->index, t); 82 | LOG_INFO("found conflict at index %lu [existing term: %lu, conflicting term: %lu], %s", 83 | entry->index, 84 | t, 85 | entry->term, 86 | status.to_string().c_str()); 87 | } 88 | return entry->index; 89 | } 90 | } 91 | return 0; 92 | } 93 | 94 | void RaftLog::next_entries(std::vector& entries) const { 95 | uint64_t off = std::max(applied_ + 1, first_index()); 96 | if (committed_ + 1 > off) { 97 | Status status = slice(off, committed_ + 1, max_next_ents_size_, entries); 98 | if (!status.is_ok()) { 99 | LOG_FATAL("unexpected error when getting unapplied entries"); 100 | } 101 | } 102 | } 103 | 104 | bool RaftLog::has_next_entries() const { 105 | uint64_t off = std::max(applied_ + 1, first_index()); 106 | return committed_ + 1 > off; 107 | } 108 | 109 | bool RaftLog::maybe_commit(uint64_t max_index, uint64_t term) { 110 | if (max_index > committed_) { 111 | uint64_t t; 112 | this->term(max_index, t); 113 | if (t == term) { 114 | commit_to(max_index); 115 | return true; 116 | } 117 | } 118 | return false; 119 | } 120 | 121 | void RaftLog::restore(proto::SnapshotPtr snapshot) { 122 | LOG_INFO("log starts to restore snapshot [index: %lu, term: %lu]", 123 | snapshot->metadata.index, 124 | snapshot->metadata.term); 125 | committed_ = snapshot->metadata.index; 126 | unstable_->restore(std::move(snapshot)); 127 | } 128 | 129 | Status RaftLog::snapshot(proto::SnapshotPtr& snap) const { 130 | if (unstable_->snapshot_) { 131 | snap = 
unstable_->snapshot_; 132 | return Status::ok(); 133 | } 134 | 135 | proto::SnapshotPtr s; 136 | Status status = storage_->snapshot(s); 137 | if (s) { 138 | snap = s; 139 | } 140 | return status; 141 | } 142 | 143 | void RaftLog::applied_to(uint64_t index) { 144 | if (index == 0) { 145 | return; 146 | } 147 | if (committed_ < index || index < applied_) { 148 | LOG_ERROR("applied(%lu) is out of range [prevApplied(%lu), committed(%lu)]", index, applied_, committed_); 149 | } 150 | applied_ = index; 151 | } 152 | 153 | Status RaftLog::slice(uint64_t low, uint64_t high, uint64_t max_size, std::vector& entries) const { 154 | Status status = must_check_out_of_bounds(low, high); 155 | if (!status.is_ok()) { 156 | return status; 157 | } 158 | if (low == high) { 159 | return Status::ok(); 160 | } 161 | 162 | //slice from storage_ 163 | if (low < unstable_->offset_) { 164 | status = storage_->entries(low, std::min(high, unstable_->offset_), max_size, entries); 165 | if (!status.is_ok()) { 166 | return status; 167 | } 168 | 169 | // check if ents has reached the size limitation 170 | if (entries.size() < std::min(high, unstable_->offset_) - low) { 171 | return Status::ok(); 172 | } 173 | 174 | } 175 | 176 | //slice unstable 177 | if (high > unstable_->offset_) { 178 | std::vector unstable; 179 | unstable_->slice(std::max(low, unstable_->offset_), high, entries); 180 | entries.insert(entries.end(), unstable.begin(), unstable.end()); 181 | } 182 | entry_limit_size(max_size, entries); 183 | return Status::ok(); 184 | } 185 | 186 | void RaftLog::commit_to(uint64_t to_commit) { 187 | // never decrease commit 188 | if (committed_ < to_commit) { 189 | if (last_index() < to_commit) { 190 | LOG_FATAL("to_commit(%lu) is out of range [lastIndex(%lu)]. 
Was the raft log corrupted, truncated, or lost?", 191 | to_commit, 192 | last_index()); 193 | } 194 | committed_ = to_commit; 195 | } else { 196 | //ignore to_commit < committed_ 197 | } 198 | } 199 | 200 | bool RaftLog::match_term(uint64_t index, uint64_t t) { 201 | uint64_t term_out; 202 | Status status = this->term(index, term_out); 203 | if (!status.is_ok()) { 204 | return false; 205 | } 206 | return t == term_out; 207 | } 208 | 209 | uint64_t RaftLog::last_term() const { 210 | uint64_t t; 211 | Status status = term(last_index(), t); 212 | assert(status.is_ok()); 213 | return t; 214 | } 215 | 216 | Status RaftLog::term(uint64_t index, uint64_t& t) const { 217 | uint64_t dummy_index = first_index() - 1; 218 | if (index < dummy_index || index > last_index()) { 219 | // TODO: return an error instead? 220 | t = 0; 221 | return Status::ok(); 222 | } 223 | 224 | uint64_t term_index; 225 | bool ok; 226 | 227 | unstable_->maybe_term(index, term_index, ok); 228 | if (ok) { 229 | t = term_index; 230 | return Status::ok(); 231 | } 232 | 233 | Status status = storage_->term(index, term_index); 234 | if (status.is_ok()) { 235 | t = term_index; 236 | } 237 | return status; 238 | } 239 | 240 | uint64_t RaftLog::first_index() const { 241 | uint64_t index; 242 | bool ok; 243 | unstable_->maybe_first_index(index, ok); 244 | if (ok) { 245 | return index; 246 | } 247 | 248 | Status status = storage_->first_index(index); 249 | assert(status.is_ok()); 250 | 251 | return index; 252 | } 253 | 254 | uint64_t RaftLog::last_index() const { 255 | uint64_t index; 256 | bool ok; 257 | unstable_->maybe_last_index(index, ok); 258 | if (ok) { 259 | return index; 260 | } 261 | 262 | Status status = storage_->last_index(index); 263 | assert(status.is_ok()); 264 | 265 | return index; 266 | } 267 | 268 | void RaftLog::all_entries(std::vector& entries) { 269 | entries.clear(); 270 | Status status = this->entries(first_index(), RaftLog::unlimited(), entries); 271 | if (status.is_ok()) { 272 | 
return; 273 | } 274 | 275 | // try again if there was a racing compaction; return afterwards so the retry is not followed by the fatal log below 276 | if (status.to_string() 277 | == Status::invalid_argument("requested index is unavailable due to compaction").to_string()) { 278 | this->all_entries(entries); return; 279 | } 280 | LOG_FATAL("%s", status.to_string().c_str()); 281 | } 282 | 283 | Status RaftLog::must_check_out_of_bounds(uint64_t low, uint64_t high) const { 284 | assert(high >= low); 285 | 286 | uint64_t first = first_index(); 287 | 288 | if (low < first) { 289 | return Status::invalid_argument("requested index is unavailable due to compaction"); 290 | } 291 | 292 | uint64_t length = last_index() + 1 - first; 293 | if (low < first || high > first + length) { 294 | LOG_FATAL("slice[%lu,%lu) out of bound [%lu,%lu]", low, high, first, last_index()); 295 | } 296 | return Status::ok(); 297 | 298 | } 299 | 300 | } 301 | 302 | -------------------------------------------------------------------------------- /tests/test_progress.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace kv; 5 | 6 | static bool cmp_InFlights(const InFlights& l, const InFlights& r) { 7 | return l.start == r.start && l.count == r.count && l.size == r.size && l.buffer == r.buffer; 8 | } 9 | 10 | TEST(progress, add) { 11 | 12 | InFlights in(10); 13 | in.buffer.resize(10, 0); 14 | 15 | for (uint32_t i = 0; i < 5; i++) { 16 | in.add(i); 17 | } 18 | 19 | InFlights wantIn(10); 20 | wantIn.start = 0; 21 | wantIn.count = 5; 22 | wantIn.buffer = std::vector{0, 1, 2, 3, 4, 0, 0, 0, 0, 0}; 23 | 24 | ASSERT_TRUE(cmp_InFlights(wantIn, in)); 25 | 26 | InFlights wantIn2(10); 27 | wantIn.start = 0; 28 | wantIn.count = 10; 29 | wantIn.buffer = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 30 | ASSERT_FALSE(cmp_InFlights(wantIn2, in)); 31 | 32 | // rotating case 33 | InFlights in2(10); 34 | in2.start = 5; 35 | in2.size = 10; 36 | in2.buffer.resize(10, 0); 37 | 38 | for (uint32_t i = 0; i < 5; i++) { 39 |
in2.add(i); 40 | } 41 | 42 | InFlights wantIn21(10); 43 | wantIn.start = 5; 44 | wantIn.count = 5; 45 | wantIn.buffer = std::vector{0, 0, 0, 0, 0, 0, 1, 2, 3, 4}; 46 | ASSERT_FALSE(cmp_InFlights(wantIn2, in2)); 47 | 48 | for (uint32_t i = 0; i < 5; i++) { 49 | in2.add(i); 50 | } 51 | 52 | InFlights wantIn22(10); 53 | wantIn.start = 10; 54 | wantIn.count = 10; 55 | wantIn.buffer = std::vector{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}; 56 | ASSERT_FALSE(cmp_InFlights(wantIn2, in2)); 57 | 58 | ASSERT_FALSE(cmp_InFlights(wantIn22, in2)); 59 | } 60 | 61 | TEST(progress, freeto) { 62 | InFlights in(10); 63 | 64 | for (uint32_t i = 0; i < 10; i++) { 65 | in.add(i); 66 | } 67 | in.free_to(4); 68 | 69 | InFlights wantIn(10); 70 | wantIn.start = 5; 71 | wantIn.count = 5; 72 | wantIn.buffer = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 73 | 74 | ASSERT_TRUE(cmp_InFlights(wantIn, in)); 75 | 76 | in.free_to(8); 77 | 78 | InFlights wantIn2(10); 79 | wantIn2.start = 9; 80 | wantIn2.count = 1; 81 | wantIn2.buffer = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 82 | ASSERT_TRUE(cmp_InFlights(wantIn2, in)); 83 | 84 | // rotating case 85 | for (uint32_t i = 10; i < 15; i++) { 86 | in.add(i); 87 | } 88 | 89 | in.free_to(12); 90 | 91 | InFlights wantIn3(10); 92 | wantIn3.start = 3; 93 | wantIn3.count = 2; 94 | wantIn3.size = 10; 95 | wantIn3.buffer = std::vector{10, 11, 12, 13, 14, 5, 6, 7, 8, 9}; 96 | ASSERT_TRUE(cmp_InFlights(wantIn3, in)); 97 | 98 | in.free_to(14); 99 | 100 | InFlights wantIn4(10); 101 | wantIn4.start = 0; 102 | wantIn4.count = 0; 103 | wantIn4.size = 10; 104 | wantIn4.buffer = std::vector{10, 11, 12, 13, 14, 5, 6, 7, 8, 9}; 105 | ASSERT_TRUE(cmp_InFlights(wantIn4, in)); 106 | } 107 | 108 | TEST(progress, FreeFirstOne) { 109 | InFlights in(10); 110 | for (uint32_t i = 0; i < 10; i++) { 111 | in.add(i); 112 | } 113 | in.free_first_one(); 114 | 115 | InFlights wantIn(10); 116 | wantIn.start = 1; 117 | wantIn.count = 9; 118 | wantIn.buffer = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 
119 | ASSERT_TRUE(cmp_InFlights(wantIn, in)); 120 | } 121 | 122 | TEST(progress, BecomeProbe) { 123 | struct Test { 124 | ProgressPtr pr; 125 | uint64_t wnext; 126 | }; 127 | std::vector tests; 128 | { 129 | ProgressPtr pr(new Progress(256)); 130 | pr->state = ProgressStateReplicate; 131 | pr->match = 1; 132 | pr->next = 5; 133 | tests.push_back(Test{.pr = pr, .wnext = 2}); 134 | } 135 | 136 | { 137 | ProgressPtr pr(new Progress(256)); 138 | pr->state = ProgressStateSnapshot; 139 | pr->match = 1; 140 | pr->next = 5; 141 | pr->pending_snapshot = 10; 142 | tests.push_back(Test{.pr = pr, .wnext = 11}); 143 | } 144 | 145 | { 146 | ProgressPtr pr(new Progress(256)); 147 | pr->state = ProgressStateSnapshot; 148 | pr->match = 1; 149 | pr->next = 5; 150 | pr->pending_snapshot = 0; 151 | tests.push_back(Test{.pr = pr, .wnext = 2}); 152 | } 153 | 154 | for (Test& test : tests) { 155 | test.pr->become_probe(); 156 | ASSERT_TRUE(test.pr->match == 1); 157 | ASSERT_TRUE(test.pr->state == ProgressStateProbe); 158 | ASSERT_TRUE(test.pr->next == test.wnext); 159 | } 160 | } 161 | 162 | TEST(progress, BecomeReplicate) { /* checks that become_replicate() leaves next at match + 1 and switches the state to replicate */ 163 | ProgressPtr pr(new Progress(256)); 164 | pr->match = 1; 165 | pr->next = 5; 166 | pr->become_replicate(); 167 | ASSERT_TRUE(pr->next == pr->match + 1); 168 | ASSERT_TRUE(pr->state == ProgressStateReplicate); 169 | } 170 | 171 | TEST(progress, BecomeSnapshot) { 172 | ProgressPtr pr(new Progress(256)); 173 | pr->match = 1; 174 | pr->next = 5; 175 | pr->become_snapshot(10); 176 | ASSERT_TRUE(pr->match == 1); 177 | ASSERT_TRUE(pr->state == ProgressStateSnapshot); 178 | ASSERT_TRUE(pr->pending_snapshot == 10); 179 | } 180 | 181 | TEST(progress, Update) { 182 | uint64_t prevM = 3; 183 | uint64_t prevN = 5; 184 | 185 | struct Test { 186 | uint64_t update; 187 | uint64_t wm; 188 | uint64_t wn; 189 | bool wok; 190 | }; 191 | std::vector tests; 192 | tests.push_back(Test{.update = prevM - 1, .wm = prevM, .wn = prevN, .wok = false}); 193 | tests.push_back(Test{.update =
prevM, .wm = prevM, .wn = prevN, .wok = false}); 194 | tests.push_back(Test{.update = prevM + 1, .wm = prevM + 1, .wn = prevN, .wok = true}); 195 | tests.push_back(Test{.update = prevM + 2, .wm = prevM + 2, .wn = prevN + 1, .wok = true}); 196 | 197 | for (Test& test: tests) { 198 | ProgressPtr pr(new Progress(256)); 199 | pr->match = prevM; 200 | pr->next = prevN; 201 | 202 | bool ok = pr->maybe_update(test.update); 203 | ASSERT_TRUE(ok == test.wok); 204 | ASSERT_TRUE(pr->match == test.wm); 205 | ASSERT_TRUE(pr->next == test.wn); 206 | } 207 | } 208 | 209 | TEST(progress, MaybeDecr) { 210 | struct Test { 211 | ProgressState state; 212 | uint64_t m; 213 | uint64_t n; 214 | uint64_t rejected; 215 | uint64_t last; 216 | bool w; 217 | uint64_t wn; 218 | }; 219 | std::vector tests; 220 | 221 | // state replicate and rejected is not greater than match 222 | tests 223 | .push_back(Test{.state = ProgressStateReplicate, .m = 5, .n = 10, .rejected = 5, .last = 5, .w = false, .wn = 10}); 224 | // state replicate and rejected is not greater than match 225 | tests 226 | .push_back(Test{.state = ProgressStateReplicate, .m = 5, .n = 10, .rejected = 4, .last = 5, .w = false, .wn = 10}); 227 | // state replicate and rejected is greater than match 228 | // directly decrease to match+1 229 | tests 230 | .push_back(Test{.state = ProgressStateReplicate, .m = 5, .n = 10, .rejected = 9, .last = 9, .w = true, .wn = 6}); 231 | // next-1 != rejected is always false 232 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 0, .rejected = 0, .last = 0, .w = false, .wn = 0}); 233 | // next-1 != rejected is always false 234 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 10, .rejected = 5, .last = 5, .w = false, .wn = 10}); 235 | // next>1 = decremented by 1 236 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 10, .rejected = 9, .last = 9, .w = true, .wn = 9}); 237 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 2, .rejected = 1, 
.last = 1, .w = true, .wn = 1}); 238 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 1, .rejected = 0, .last = 0, .w = true, .wn = 1}); 239 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 10, .rejected = 9, .last = 2, .w = true, .wn = 3}); 240 | tests.push_back(Test{.state = ProgressStateProbe, .m = 0, .n = 10, .rejected = 9, .last = 0, .w = true, .wn = 1}); 241 | for (Test& test: tests) { 242 | ProgressPtr pr(new Progress(256)); 243 | pr->state = test.state; 244 | pr->match = test.m; 245 | pr->next = test.n; 246 | 247 | bool ok = pr->maybe_decreases_to(test.rejected, test.last); 248 | ASSERT_TRUE(ok == test.w); 249 | ASSERT_TRUE(pr->match == test.m); 250 | ASSERT_TRUE(pr->next == test.wn); 251 | } 252 | } 253 | 254 | TEST(progress, IsPaused) { 255 | struct Test { 256 | ProgressState state; 257 | bool paused; 258 | bool w; 259 | }; 260 | std::vector tests; 261 | tests.push_back(Test{.state = ProgressStateProbe, .paused = false, .w = false}); 262 | tests.push_back(Test{.state = ProgressStateProbe, .paused = true, .w = true}); 263 | tests.push_back(Test{.state = ProgressStateReplicate, .paused = false, .w = false}); 264 | tests.push_back(Test{.state = ProgressStateReplicate, .paused = true, .w = false}); 265 | tests.push_back(Test{.state = ProgressStateSnapshot, .paused = false, .w = true}); 266 | tests.push_back(Test{.state = ProgressStateSnapshot, .paused = true, .w = true}); 267 | for (Test& test: tests) { 268 | ProgressPtr pr(new Progress(256)); 269 | pr->state = test.state; 270 | pr->paused = test.paused; 271 | ASSERT_TRUE(pr->is_paused() == test.w); 272 | } 273 | } 274 | 275 | TEST(progress, resume) { 276 | ProgressPtr pr(new Progress(256)); 277 | pr->next = 2; 278 | pr->paused = true; 279 | pr->maybe_decreases_to(2, 2); 280 | ASSERT_TRUE(pr->paused); 281 | pr->maybe_update(2); 282 | ASSERT_FALSE(pr->paused); 283 | } 284 | 285 | int main(int argc, char* argv[]) { 286 | testing::InitGoogleTest(&argc, argv); 287 | return 
RUN_ALL_TESTS(); 288 | } 289 | -------------------------------------------------------------------------------- /raft-kv/server/redis_store.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | // see redis keys command 10 | int string_match_len(const char* pattern, int patternLen, 11 | const char* string, int stringLen, int nocase) { 12 | while (patternLen && stringLen) { 13 | switch (pattern[0]) { 14 | case '*': 15 | while (pattern[1] == '*') { 16 | pattern++; 17 | patternLen--; 18 | } 19 | if (patternLen == 1) 20 | return 1; /* match */ 21 | while (stringLen) { 22 | if (string_match_len(pattern + 1, patternLen - 1, 23 | string, stringLen, nocase)) 24 | return 1; /* match */ 25 | string++; 26 | stringLen--; 27 | } 28 | return 0; /* no match */ 29 | break; 30 | case '?': 31 | if (stringLen == 0) 32 | return 0; /* no match */ 33 | string++; 34 | stringLen--; 35 | break; 36 | case '[': { 37 | int not_match, match; 38 | 39 | pattern++; 40 | patternLen--; 41 | not_match = pattern[0] == '^'; 42 | if (not_match) { 43 | pattern++; 44 | patternLen--; 45 | } 46 | match = 0; 47 | while (1) { 48 | if (pattern[0] == '\\' && patternLen >= 2) { 49 | pattern++; 50 | patternLen--; 51 | if (pattern[0] == string[0]) 52 | match = 1; 53 | } else if (pattern[0] == ']') { 54 | break; 55 | } else if (patternLen == 0) { 56 | pattern--; 57 | patternLen++; 58 | break; 59 | } else if (pattern[1] == '-' && patternLen >= 3) { 60 | int start = pattern[0]; 61 | int end = pattern[2]; 62 | int c = string[0]; 63 | if (start > end) { 64 | int t = start; 65 | start = end; 66 | end = t; 67 | } 68 | if (nocase) { 69 | start = tolower(start); 70 | end = tolower(end); 71 | c = tolower(c); 72 | } 73 | pattern += 2; 74 | patternLen -= 2; 75 | if (c >= start && c <= end) 76 | match = 1; 77 | } else { 78 | if (!nocase) { 79 | if (pattern[0] == string[0]) 80 | match = 1; 81 | } else 
{ 82 | if (tolower((int) pattern[0]) == tolower((int) string[0])) 83 | match = 1; 84 | } 85 | } 86 | pattern++; 87 | patternLen--; 88 | } 89 | if (not_match) 90 | match = !match; 91 | if (!match) 92 | return 0; /* no match */ 93 | string++; 94 | stringLen--; 95 | break; 96 | } 97 | case '\\': 98 | if (patternLen >= 2) { 99 | pattern++; 100 | patternLen--; 101 | } 102 | /* fall through */ 103 | default: 104 | if (!nocase) { 105 | if (pattern[0] != string[0]) 106 | return 0; /* no match */ 107 | } else { 108 | if (tolower((int) pattern[0]) != tolower((int) string[0])) 109 | return 0; /* no match */ 110 | } 111 | string++; 112 | stringLen--; 113 | break; 114 | } 115 | pattern++; 116 | patternLen--; 117 | if (stringLen == 0) { 118 | while (*pattern == '*') { 119 | pattern++; 120 | patternLen--; 121 | } 122 | break; 123 | } 124 | } 125 | if (patternLen == 0 && stringLen == 0) 126 | return 1; 127 | return 0; 128 | } 129 | 130 | RedisStore::RedisStore(RaftNode* server, std::vector snap, uint16_t port) 131 | : server_(server), 132 | acceptor_(io_service_), 133 | next_request_id_(0) { 134 | 135 | if (!snap.empty()) { 136 | std::unordered_map kv; 137 | msgpack::object_handle oh = msgpack::unpack((const char*) snap.data(), snap.size()); 138 | try { 139 | oh.get().convert(kv); 140 | } catch (std::exception& e) { 141 | LOG_WARN("invalid snapshot"); 142 | } 143 | std::swap(kv, key_values_); 144 | } 145 | 146 | auto address = boost::asio::ip::address::from_string("0.0.0.0"); 147 | auto endpoint = boost::asio::ip::tcp::endpoint(address, port); 148 | 149 | acceptor_.open(endpoint.protocol()); 150 | acceptor_.set_option(boost::asio::ip::tcp::acceptor::reuse_address(1)); 151 | acceptor_.bind(endpoint); 152 | acceptor_.listen(); 153 | } 154 | 155 | RedisStore::~RedisStore() { 156 | if (worker_.joinable()) { 157 | worker_.join(); 158 | } 159 | } 160 | 161 | void RedisStore::start(std::promise& promise) { 162 | start_accept(); 163 | 164 | worker_ = std::thread([this, &promise]() { 165 | 
promise.set_value(pthread_self()); 166 | this->io_service_.run(); 167 | }); 168 | } 169 | 170 | void RedisStore::start_accept() { 171 | RedisSessionPtr session(new RedisSession(this, io_service_)); 172 | 173 | acceptor_.async_accept(session->socket_, [this, session](const boost::system::error_code& error) { 174 | if (error) { 175 | LOG_DEBUG("accept error %s", error.message().c_str()); 176 | return; 177 | } 178 | this->start_accept(); 179 | session->start(); 180 | }); 181 | } 182 | 183 | void RedisStore::set(std::string key, std::string value, const StatusCallback& callback) { 184 | uint32_t commit_id = next_request_id_++; 185 | 186 | RaftCommit commit; 187 | commit.node_id = static_cast(server_->node_id()); 188 | commit.commit_id = commit_id; 189 | commit.redis_data.type = RedisCommitData::kCommitSet; 190 | commit.redis_data.strs.push_back(std::move(key)); 191 | commit.redis_data.strs.push_back(std::move(value)); 192 | 193 | msgpack::sbuffer sbuf; 194 | msgpack::pack(sbuf, commit); 195 | std::shared_ptr> data(new std::vector(sbuf.data(), sbuf.data() + sbuf.size())); 196 | 197 | pending_requests_[commit_id] = callback; 198 | 199 | server_->propose(std::move(data), [this, commit_id](const Status& status) { 200 | io_service_.post([this, status, commit_id]() { 201 | if (status.is_ok()) { 202 | return; 203 | } 204 | 205 | auto it = pending_requests_.find(commit_id); 206 | if (it != pending_requests_.end()) { 207 | it->second(status); 208 | pending_requests_.erase(it); 209 | } 210 | }); 211 | }); 212 | } 213 | 214 | void RedisStore::del(std::vector keys, const StatusCallback& callback) { 215 | uint32_t commit_id = next_request_id_++; 216 | 217 | RaftCommit commit; 218 | commit.node_id = static_cast(server_->node_id()); 219 | commit.commit_id = commit_id; 220 | commit.redis_data.type = RedisCommitData::kCommitDel; 221 | commit.redis_data.strs = std::move(keys); 222 | msgpack::sbuffer sbuf; 223 | msgpack::pack(sbuf, commit); 224 | std::shared_ptr> data(new 
std::vector(sbuf.data(), sbuf.data() + sbuf.size())); 225 | 226 | pending_requests_[commit_id] = callback; 227 | 228 | server_->propose(std::move(data), [this, commit_id](const Status& status) { 229 | io_service_.post([commit_id, status, this]() { 230 | 231 | auto it = pending_requests_.find(commit_id); 232 | if (it != pending_requests_.end()) { 233 | it->second(status); 234 | pending_requests_.erase(it); 235 | } 236 | }); 237 | }); 238 | } 239 | 240 | void RedisStore::get_snapshot(const GetSnapshotCallback& callback) { 241 | io_service_.post([this, callback] { 242 | msgpack::sbuffer sbuf; 243 | msgpack::pack(sbuf, this->key_values_); 244 | SnapshotDataPtr data(new std::vector(sbuf.data(), sbuf.data() + sbuf.size())); 245 | callback(data); 246 | }); 247 | } 248 | 249 | void RedisStore::recover_from_snapshot(SnapshotDataPtr snap, const StatusCallback& callback) { 250 | io_service_.post([this, snap, callback] { 251 | std::unordered_map kv; 252 | msgpack::object_handle oh = msgpack::unpack((const char*) snap->data(), snap->size()); 253 | try { 254 | oh.get().convert(kv); 255 | } catch (std::exception& e) { 256 | callback(Status::io_error("invalid snapshot")); 257 | return; 258 | } 259 | std::swap(kv, key_values_); 260 | callback(Status::ok()); 261 | }); 262 | } 263 | 264 | void RedisStore::keys(const char* pattern, int len, std::vector& keys) { 265 | for (auto it = key_values_.begin(); it != key_values_.end(); ++it) { 266 | if (string_match_len(pattern, len, it->first.c_str(), it->first.size(), 0)) { 267 | keys.push_back(it->first); 268 | } 269 | } 270 | } 271 | 272 | void RedisStore::read_commit(proto::EntryPtr entry) { 273 | auto cb = [this, entry] { 274 | RaftCommit commit; 275 | try { 276 | msgpack::object_handle oh = msgpack::unpack((const char*) entry->data.data(), entry->data.size()); 277 | oh.get().convert(commit); 278 | 279 | } 280 | catch (std::exception& e) { 281 | LOG_ERROR("bad entry %s", e.what()); 282 | return; 283 | } 284 | RedisCommitData& data = 
commit.redis_data; 285 | 286 | switch (data.type) { 287 | case RedisCommitData::kCommitSet: { 288 | assert(data.strs.size() == 2); 289 | this->key_values_[std::move(data.strs[0])] = std::move(data.strs[1]); 290 | break; 291 | } 292 | case RedisCommitData::kCommitDel: { 293 | for (const std::string& key : data.strs) { 294 | this->key_values_.erase(key); 295 | } 296 | break; 297 | } 298 | default: { 299 | LOG_ERROR("not supported type %d", data.type); 300 | } 301 | } 302 | 303 | if (commit.node_id == server_->node_id()) { 304 | auto it = pending_requests_.find(commit.commit_id); 305 | if (it != pending_requests_.end()) { 306 | it->second(Status::ok()); 307 | pending_requests_.erase(it); 308 | } 309 | } 310 | }; 311 | 312 | io_service_.post(std::move(cb)); 313 | } 314 | 315 | } 316 | 317 | -------------------------------------------------------------------------------- /tests/network.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace kv; 9 | 10 | bool entry_cmp(const std::vector& left, const std::vector& right) { 11 | if (left.size() != right.size()) { 12 | return false; 13 | } 14 | 15 | for (size_t i = 0; i < left.size(); ++i) { 16 | if (left[i]->index != right[i]->index) { 17 | return false; 18 | } 19 | 20 | if (left[i]->term != right[i]->term) { 21 | return false; 22 | } 23 | if (left[i]->type != right[i]->type) { 24 | return false; 25 | } 26 | if (left[i]->data != right[i]->data) { 27 | return false; 28 | } 29 | } 30 | return true; 31 | } 32 | 33 | std::vector nextEnts(RaftPtr r, MemoryStoragePtr s) { 34 | s->append(r->raft_log_->unstable_entries()); 35 | r->raft_log_->stable_to(r->raft_log_->last_index(), r->raft_log_->last_term()); 36 | 37 | std::vector ents; 38 | r->raft_log_->next_entries(ents); 39 | r->raft_log_->applied_to(r->raft_log_->committed_); 40 | return ents; 41 | } 42 | 43 | typedef std::function ConfigFunc; 44 | 
45 | Config newTestConfig(uint64_t id, 46 | std::vector peers, 47 | uint32_t election, 48 | uint32_t heartbeat, 49 | StoragePtr storage) { 50 | Config c; 51 | c.id = id; 52 | c.peers = peers; 53 | c.election_tick = election; 54 | c.heartbeat_tick = heartbeat; 55 | c.storage = storage; 56 | c.max_size_per_msg = std::numeric_limits::max(); 57 | c.max_inflight_msgs = 256; 58 | c.validate(); 59 | return c; 60 | } 61 | 62 | RaftPtr entsWithConfig(ConfigFunc configFunc, std::vector terms) { 63 | MemoryStoragePtr storage(new MemoryStorage()); 64 | for (size_t i = 0; i < terms.size(); ++i) { 65 | uint64_t term = terms[i]; 66 | std::vector entries; 67 | proto::EntryPtr e(new proto::Entry()); 68 | e->index = i + 1; 69 | e->term = term; 70 | entries.push_back(e); 71 | storage->append(entries); 72 | } 73 | auto 74 | cfg = newTestConfig(1, std::vector(), 5, 1, storage); 75 | if (configFunc) { 76 | configFunc(cfg); 77 | } 78 | 79 | RaftPtr sm(new Raft(cfg)); 80 | sm->reset(terms.back()); 81 | return sm; 82 | } 83 | 84 | std::vector str_to_vector(const char* str) { 85 | size_t len = strlen(str); 86 | std::vector data(str, str + len); 87 | return data; 88 | } 89 | 90 | RaftPtr newTestRaft(uint64_t id, 91 | std::vector peers, 92 | uint64_t election, 93 | uint64_t heartbeat, 94 | StoragePtr storage) { 95 | Config c = newTestConfig(id, peers, election, heartbeat, storage); 96 | c.max_inflight_msgs = 256; 97 | Status status = c.validate(); 98 | assert(status.is_ok()); 99 | return std::make_shared(c); 100 | } 101 | 102 | RaftPtr newTestLearnerRaft(uint64_t id, 103 | std::vector peers, 104 | std::vector learners, 105 | uint64_t election, 106 | uint64_t heartbeat, 107 | StoragePtr storage) { 108 | Config c = newTestConfig(id, peers, election, heartbeat, storage); 109 | c.learners = learners; 110 | c.max_inflight_msgs = 256; 111 | Status status = c.validate(); 112 | assert(status.is_ok()); 113 | return std::make_shared(c); 114 | } 115 | 116 | RaftPtr votedWithConfig(ConfigFunc configFunc, 
uint64_t vote, uint64_t term) { 117 | MemoryStoragePtr storage(new MemoryStorage()); 118 | proto::HardState hs; 119 | hs.term = term; 120 | hs.vote = vote; 121 | 122 | storage->set_hard_state(hs); 123 | Config cfg = newTestConfig(1, std::vector(), 5, 1, storage); 124 | if (configFunc) { 125 | configFunc(cfg); 126 | } 127 | cfg.validate(); 128 | RaftPtr sm(new Raft(cfg)); 129 | sm->reset(term); 130 | return sm; 131 | } 132 | 133 | struct connem { 134 | uint64_t from; 135 | uint64_t to; 136 | }; 137 | 138 | bool operator==(const connem& lhs, const connem& rhs) { 139 | return lhs.from == rhs.from && lhs.to == rhs.to; 140 | } 141 | 142 | namespace std { 143 | 144 | template<> 145 | struct hash { 146 | std::size_t operator()(const connem& c) const { 147 | return boost::hash_value(tie(c.from, c.to)); 148 | } 149 | }; 150 | 151 | } 152 | 153 | std::vector idsBySize(size_t size) { 154 | std::vector ids(size); 155 | for (size_t i = 0; i < size; i++) { 156 | ids[i] = 1 + i; 157 | } 158 | return ids; 159 | } 160 | 161 | void preVoteConfig(Config& c) { 162 | c.pre_vote = true; 163 | } 164 | 165 | class BlackHole : public Raft { 166 | public: 167 | explicit BlackHole() 168 | : Raft(newTestConfig(0, std::vector{1, 2, 3}, 1, 2, std::make_shared())) {} 169 | 170 | virtual std::vector read_messages() { 171 | std::vector ret; 172 | return ret; 173 | } 174 | 175 | virtual Status step(proto::MessagePtr msg) { 176 | return Status::ok(); 177 | } 178 | 179 | }; 180 | 181 | struct Network { 182 | explicit Network(const std::vector& peers) 183 | : Network([](Config& c) {}, peers) { 184 | 185 | } 186 | 187 | explicit Network(const ConfigFunc& configFunc, const std::vector& peers) 188 | : dev(0, 99) { /* the generator range is inclusive at both ends; drawing from 0..99 makes a drop rate of 1.0 satisfy n < perc * 100 on every draw, so such a link never leaks a message */ 189 | size_t size = peers.size(); 190 | auto peerAddrs = idsBySize(size); 191 | 192 | for (size_t j = 0; j < peers.size(); ++j) { 193 | RaftPtr p = peers[j]; 194 | uint64_t id = peerAddrs[j]; 195 | 196 | if (p == nullptr) { 197 | auto mem = std::make_shared(); 198 | storage[id] = mem; 199 |
Config cfg = newTestConfig(id, peerAddrs, 10, 1, mem); 200 | configFunc(cfg); 201 | RaftPtr sm(new Raft(cfg)); 202 | this->peers[id] = sm; 203 | continue; 204 | } 205 | 206 | std::shared_ptr bh = std::dynamic_pointer_cast(p); 207 | if (bh == nullptr) { 208 | LOG_DEBUG("Raft instance") 209 | std::unordered_map learners; 210 | for (auto it = p->learner_prs_.begin(); it != p->learner_prs_.end(); ++it) { 211 | learners[it->first] = true; 212 | } 213 | p->id_ = id; 214 | p->prs_.clear(); 215 | p->learner_prs_.clear(); 216 | 217 | for (size_t i = 0; i < size; i++) { 218 | auto it = learners.find(peerAddrs[i]); 219 | ProgressPtr pr(new Progress(0)); 220 | 221 | if (it != learners.end()) { 222 | pr->is_learner = true; 223 | p->learner_prs_[peerAddrs[i]] = pr; 224 | } else { 225 | pr->is_learner = false; 226 | p->prs_[peerAddrs[i]] = pr; 227 | } 228 | } 229 | p->reset(p->term_); 230 | this->peers[id] = p; 231 | } else { 232 | LOG_DEBUG("BlackHole instance") 233 | this->peers[id] = bh; 234 | } 235 | } 236 | 237 | } 238 | void cut(uint64_t one, uint64_t other) { 239 | drop(one, other, 2.0); // always drop 240 | drop(other, one, 2.0); // always drop 241 | } 242 | 243 | void drop(uint64_t from, uint64_t to, float perc) { 244 | connem cn; 245 | cn.to = to; 246 | cn.from = from; 247 | dropm[cn] = perc; 248 | } 249 | 250 | void isolate(uint64_t id) { 251 | for (size_t i = 0; i < peers.size(); ++i) { 252 | uint64_t nid = i + 1; 253 | if (nid != id) { 254 | 255 | drop(id, nid, 1.0); // always drop 256 | drop(nid, id, 1.0); // always drop 257 | } 258 | } 259 | } 260 | 261 | void ignore(proto::MessageType t) { 262 | ignorem[t] = true; 263 | } 264 | 265 | void recover() { 266 | dropm.clear(); 267 | ignorem.clear(); 268 | } 269 | 270 | void send(proto::MessagePtr msg) { 271 | std::vector msgs{msg}; 272 | this->send(msgs); 273 | } 274 | 275 | void send(std::vector& msgs) { 276 | std::deque queue; 277 | for (proto::MessagePtr m: msgs) { 278 | queue.push_back(m); 279 | } 280 | while 
(!queue.empty()) { 281 | auto m = queue.front(); 282 | queue.pop_front(); 283 | auto p = peers[m->to]; 284 | p->step(m); 285 | auto ms = p->read_messages(); 286 | ms = filter(ms); 287 | for (proto::MessagePtr m: ms) { 288 | queue.push_back(m); 289 | } 290 | } 291 | } 292 | 293 | std::vector filter(const std::vector& msgs) { 294 | //return msgs; 295 | std::vector mm; 296 | 297 | for (proto::MessagePtr m : msgs) { 298 | if (ignorem[m->type]) { 299 | continue; 300 | } 301 | 302 | switch (m->type) { 303 | case proto::MsgHup: 304 | // hups never go over the network, so don't drop them but panic 305 | LOG_FATAL("unexpected msgHup"); 306 | default: { 307 | connem c; 308 | c.from = m->from; 309 | c.to = m->to; 310 | auto perc = dropm[c]; 311 | 312 | auto n = (float) dev.gen(); 313 | if (n < perc * 100) { 314 | LOG_DEBUG("drop message, %lu, %lu, %s", m->from, m->to, proto::msg_type_to_string(m->type)); 315 | continue; 316 | } 317 | } 318 | } 319 | 320 | if (this->msgHook) { 321 | if (!this->msgHook(m)) { 322 | continue; 323 | } 324 | } 325 | mm.push_back(m); 326 | 327 | } 328 | return mm; 329 | } 330 | 331 | RandomDevice dev; 332 | std::unordered_map peers; 333 | std::unordered_map storage; 334 | std::unordered_map dropm; 335 | std::unordered_map ignorem; 336 | 337 | // msgHook is called for each message sent. It may inspect the 338 | // message and return true to send it or false to drop it. 
339 | 340 | std::function msgHook; 341 | }; 342 | typedef std::shared_ptr NetworkPtr; 343 | -------------------------------------------------------------------------------- /raft-kv/server/redis_session.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace kv { 8 | 9 | #define RECEIVE_BUFFER_SIZE (1024 * 512) 10 | 11 | namespace shared { 12 | 13 | static const char* ok = "+OK\r\n"; 14 | static const char* err = "-ERR %s\r\n"; 15 | static const char* wrong_type = "-WRONGTYPE Operation against a key holding the wrong kind of value\r\n"; 16 | static const char* unknown_command = "-ERR unknown command `%s`\r\n"; 17 | static const char* wrong_number_arguments = "-ERR wrong number of arguments for '%s' command\r\n"; 18 | static const char* pong = "+PONG\r\n"; 19 | static const char* null = "$-1\r\n"; 20 | 21 | typedef std::function CommandCallback; 22 | 23 | static std::unordered_map command_table = { 24 | {"ping", RedisSession::ping_command}, 25 | {"PING", RedisSession::ping_command}, 26 | {"get", RedisSession::get_command}, 27 | {"GET", RedisSession::get_command}, 28 | {"set", RedisSession::set_command}, 29 | {"SET", RedisSession::set_command}, 30 | {"del", RedisSession::del_command}, 31 | {"DEL", RedisSession::del_command}, 32 | {"keys", RedisSession::keys_command}, 33 | {"KEYS", RedisSession::keys_command}, 34 | }; 35 | 36 | } 37 | 38 | static void build_redis_string_array_reply(const std::vector& strs, std::string& reply) { 39 | /* appends strs to reply as a RESP array of bulk strings, e.g. *2\r\n$4\r\nkey1\r\n$4\r\nkey2\r\n */ 40 | 41 | char buffer[64]; 42 | snprintf(buffer, sizeof(buffer), "*%zu\r\n", strs.size()); 43 | reply.append(buffer); 44 | 45 | for (const std::string& str : strs) { 46 | snprintf(buffer, sizeof(buffer), "$%zu\r\n", str.size()); 47 | reply.append(buffer); 48 | 49 | /* every bulk string carries its payload terminator, so an empty one 50 | goes out as $0\r\n\r\n rather than a bare $0\r\n */ 51 | reply.append(str); 52 | reply.append("\r\n"); 53 | } 54 | } 55 | 56 |
RedisSession::RedisSession(RedisStore* server, boost::asio::io_service& io_service) 57 | : quit_(false), 58 | server_(server), 59 | socket_(io_service), 60 | read_buffer_(RECEIVE_BUFFER_SIZE), 61 | reader_(redisReaderCreate()) { 62 | } 63 | 64 | void RedisSession::start() { 65 | if (quit_) { 66 | return; 67 | } 68 | auto self = shared_from_this(); 69 | auto buffer = boost::asio::buffer(read_buffer_.data(), read_buffer_.size()); 70 | auto handler = [self](const boost::system::error_code& error, size_t bytes) { 71 | if (bytes == 0) { 72 | return; 73 | } 74 | if (error) { 75 | LOG_DEBUG("read error %s", error.message().c_str()); 76 | return; 77 | } 78 | 79 | self->handle_read(bytes); 80 | 81 | }; 82 | socket_.async_read_some(buffer, std::move(handler)); 83 | } 84 | 85 | void RedisSession::handle_read(size_t bytes) { 86 | uint8_t* start = read_buffer_.data(); 87 | uint8_t* end = read_buffer_.data() + bytes; 88 | int err = REDIS_OK; 89 | std::vector replies; 90 | 91 | while (!quit_ && start < end) { 92 | uint8_t* p = (uint8_t*) memchr(start, '\n', bytes); 93 | if (!p) { 94 | this->start(); 95 | break; 96 | } 97 | 98 | size_t n = p + 1 - start; 99 | err = redisReaderFeed(reader_, (const char*) start, n); 100 | if (err != REDIS_OK) { 101 | LOG_DEBUG("redis protocol error %d, %s", err, reader_->errstr); 102 | quit_ = true; 103 | break; 104 | } 105 | 106 | struct redisReply* reply = NULL; 107 | err = redisReaderGetReply(reader_, (void**) &reply); 108 | if (err != REDIS_OK) { 109 | LOG_DEBUG("redis protocol error %d, %s", err, reader_->errstr); 110 | quit_ = true; 111 | break; 112 | } 113 | if (reply) { 114 | replies.push_back(reply); 115 | } 116 | 117 | start += n; 118 | bytes -= n; 119 | } 120 | if (err == REDIS_OK) { 121 | for (struct redisReply* reply : replies) { 122 | on_redis_reply(reply); 123 | } 124 | this->start(); 125 | } 126 | 127 | for (struct redisReply* reply : replies) { 128 | freeReplyObject(reply); 129 | } 130 | } 131 | 132 | void 
RedisSession::on_redis_reply(struct redisReply* reply) { 133 | char buffer[256]; 134 | if (reply->type != REDIS_REPLY_ARRAY) { 135 | LOG_WARN("wrong type %d", reply->type); 136 | send_reply(shared::wrong_type, strlen(shared::wrong_type)); 137 | return; 138 | } 139 | 140 | if (reply->elements < 1) { 141 | LOG_WARN("wrong elements %lu", reply->elements); 142 | int n = snprintf(buffer, sizeof(buffer), shared::wrong_number_arguments, ""); 143 | send_reply(buffer, n); 144 | return; 145 | } 146 | 147 | if (reply->element[0]->type != REDIS_REPLY_STRING) { 148 | LOG_WARN("wrong type %d", reply->element[0]->type); 149 | send_reply(shared::wrong_type, strlen(shared::wrong_type)); 150 | return; 151 | } 152 | 153 | std::string command(reply->element[0]->str, reply->element[0]->len); 154 | auto it = shared::command_table.find(command); 155 | if (it == shared::command_table.end()) { 156 | int n = snprintf(buffer, sizeof(buffer), shared::unknown_command, command.c_str()); 157 | send_reply(buffer, n); 158 | return; 159 | } 160 | shared::CommandCallback& cb = it->second; 161 | cb(shared_from_this(), reply); 162 | } 163 | 164 | void RedisSession::send_reply(const char* data, uint32_t len) { 165 | uint32_t bytes = send_buffer_.readable_bytes(); 166 | send_buffer_.put((uint8_t*) data, len); 167 | if (bytes == 0) { 168 | start_send(); 169 | } 170 | } 171 | 172 | void RedisSession::start_send() { 173 | if (!send_buffer_.readable()) { 174 | return; 175 | } 176 | auto self = shared_from_this(); 177 | uint32_t remaining = send_buffer_.readable_bytes(); 178 | auto buffer = boost::asio::buffer(send_buffer_.reader(), remaining); 179 | auto handler = [self](const boost::system::error_code& error, std::size_t bytes) { 180 | if (bytes == 0) { 181 | return;; 182 | } 183 | if (error) { 184 | LOG_DEBUG("send error %s", error.message().c_str()); 185 | return; 186 | } 187 | std::string str((const char*) self->send_buffer_.reader(), bytes); 188 | self->send_buffer_.read_bytes(bytes); 189 | 
self->start_send(); 190 | }; 191 | boost::asio::async_write(socket_, buffer, std::move(handler)); 192 | } 193 | 194 | void RedisSession::ping_command(std::shared_ptr self, struct redisReply* reply) { 195 | self->send_reply(shared::pong, strlen(shared::pong)); 196 | } 197 | 198 | void RedisSession::get_command(std::shared_ptr self, struct redisReply* reply) { /* GET key: replies with the stored value as a bulk string, or the null bulk string when the key is absent */ 199 | assert(reply->type == REDIS_REPLY_ARRAY); 200 | assert(reply->elements > 0); 201 | char buffer[256]; 202 | 203 | if (reply->elements != 2) { 204 | LOG_WARN("wrong elements %lu", reply->elements); 205 | int n = snprintf(buffer, sizeof(buffer), shared::wrong_number_arguments, "get"); 206 | self->send_reply(buffer, n); 207 | return; 208 | } 209 | 210 | if (reply->element[1]->type != REDIS_REPLY_STRING) { 211 | LOG_WARN("wrong type %d", reply->element[1]->type); 212 | self->send_reply(shared::wrong_type, strlen(shared::wrong_type)); 213 | return; 214 | } 215 | 216 | std::string value; 217 | std::string key(reply->element[1]->str, reply->element[1]->len); 218 | bool get = self->server_->get(key, value); 219 | if (!get) { 220 | self->send_reply(shared::null, strlen(shared::null)); 221 | } else { 222 | std::string out = "$" + std::to_string(value.size()) + "\r\n"; /* built by length, not via %s and strlen, so a value containing NUL bytes is sent whole */ 223 | out.append(value).append("\r\n"); 224 | self->send_reply(out.data(), out.size()); 225 | } 226 | } 227 | 228 | void RedisSession::set_command(std::shared_ptr self, struct redisReply* reply) { 229 | assert(reply->type == REDIS_REPLY_ARRAY); 230 | assert(reply->elements > 0); 231 | char buffer[256]; 232 | 233 | if (reply->elements != 3) { 234 | LOG_WARN("wrong elements %lu", reply->elements); 235 | int n = snprintf(buffer, sizeof(buffer), shared::wrong_number_arguments, "set"); 236 | self->send_reply(buffer, n); 237 | return; 238 | } 239 | 240 | if (reply->element[1]->type != REDIS_REPLY_STRING || reply->element[2]->type != REDIS_REPLY_STRING) { 241 | LOG_WARN("wrong type %d", reply->element[1]->type); 242 | self->send_reply(shared::wrong_type,
strlen(shared::wrong_type)); 243 | return; 244 | } 245 | std::string key(reply->element[1]->str, reply->element[1]->len); 246 | std::string value(reply->element[2]->str, reply->element[2]->len); 247 | self->server_->set(std::move(key), std::move(value), [self](const Status& status) { 248 | if (status.is_ok()) { 249 | self->send_reply(shared::ok, strlen(shared::ok)); 250 | } else { 251 | char buff[256]; 252 | int n = snprintf(buff, sizeof(buff), shared::err, status.to_string().c_str()); 253 | self->send_reply(buff, n); 254 | } 255 | }); 256 | } 257 | 258 | void RedisSession::del_command(std::shared_ptr self, struct redisReply* reply) { 259 | assert(reply->type = REDIS_REPLY_ARRAY); 260 | assert(reply->elements > 0); 261 | char buffer[256]; 262 | 263 | if (reply->elements <= 1) { ; 264 | int n = snprintf(buffer, sizeof(buffer), shared::wrong_number_arguments, "del"); 265 | self->send_reply(buffer, n); 266 | return; 267 | } 268 | 269 | std::vector keys; 270 | for (size_t i = 1; i < reply->elements; ++i) { 271 | redisReply* element = reply->element[i]; 272 | if (element->type != REDIS_REPLY_STRING) { 273 | self->send_reply(shared::wrong_type, strlen(shared::wrong_type)); 274 | return; 275 | } 276 | 277 | keys.emplace_back(element->str, element->len); 278 | } 279 | 280 | self->server_->del(std::move(keys), [self](const Status& status) { 281 | if (status.is_ok()) { 282 | self->send_reply(shared::ok, strlen(shared::ok)); 283 | } else { 284 | char buff[256]; 285 | int n = snprintf(buff, sizeof(buff), shared::err, status.to_string().c_str()); 286 | self->send_reply(buff, n); 287 | } 288 | }); 289 | } 290 | 291 | void RedisSession::keys_command(std::shared_ptr self, struct redisReply* reply) { 292 | assert(reply->type = REDIS_REPLY_ARRAY); 293 | assert(reply->elements > 0); 294 | char buffer[256]; 295 | 296 | if (reply->elements != 2) { ; 297 | int n = snprintf(buffer, sizeof(buffer), shared::wrong_number_arguments, "keys"); 298 | self->send_reply(buffer, n); 299 | return; 
300 | } 301 | 302 | redisReply* element = reply->element[1]; 303 | 304 | if (element->type != REDIS_REPLY_STRING) { 305 | self->send_reply(shared::wrong_type, strlen(shared::wrong_type)); 306 | return; 307 | } 308 | 309 | std::vector keys; 310 | self->server_->keys(element->str, element->len, keys); 311 | std::string str; 312 | build_redis_string_array_reply(keys, str); 313 | self->send_reply(str.data(), str.size()); 314 | } 315 | 316 | } 317 | -------------------------------------------------------------------------------- /raft-kv/wal/wal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace kv { 11 | 12 | static const WAL_type wal_InvalidType = 0; 13 | static const WAL_type wal_EntryType = 1; 14 | static const WAL_type wal_StateType = 2; 15 | static const WAL_type wal_CrcType = 3; 16 | static const WAL_type wal_snapshot_Type = 4; 17 | static const int SegmentSizeBytes = 64 * 1000 * 1000; // 64MB 18 | 19 | static std::string wal_name(uint64_t seq, uint64_t index) { 20 | char buffer[64]; 21 | snprintf(buffer, sizeof(buffer), "%016" PRIx64 "-%016" PRIx64 ".wal", seq, index); 22 | return buffer; 23 | } 24 | 25 | class WAL_File { 26 | public: 27 | WAL_File(const char* path, int64_t seq) 28 | : seq(seq), 29 | file_size(0) { 30 | fp = fopen(path, "a+"); 31 | if (!fp) { 32 | LOG_FATAL("fopen error %s", strerror(errno)); 33 | } 34 | 35 | file_size = ftell(fp); 36 | if (file_size == -1) { 37 | LOG_FATAL("ftell error %s", strerror(errno)); 38 | } 39 | 40 | if (fseek(fp, 0L, SEEK_SET) == -1) { 41 | LOG_FATAL("fseek error %s", strerror(errno)); 42 | } 43 | } 44 | 45 | ~WAL_File() { 46 | fclose(fp); 47 | } 48 | 49 | void truncate(size_t offset) { 50 | if (ftruncate(fileno(fp), offset) != 0) { 51 | LOG_FATAL("ftruncate error %s", strerror(errno)); 52 | } 53 | 54 | if (fseek(fp, offset, SEEK_SET) == -1) { 55 | 
LOG_FATAL("fseek error %s", strerror(errno)); 56 | } 57 | 58 | file_size = offset; 59 | data_buffer.clear(); 60 | } 61 | 62 | void append(WAL_type type, const uint8_t* data, size_t len) { 63 | WAL_Record record; 64 | record.type = type; 65 | record.crc = compute_crc32((char*) data, len); 66 | set_WAL_Record_len(record, len); 67 | uint8_t* ptr = (uint8_t*) &record; 68 | data_buffer.insert(data_buffer.end(), ptr, ptr + sizeof(record)); 69 | data_buffer.insert(data_buffer.end(), data, data + len); 70 | } 71 | 72 | void sync() { 73 | if (data_buffer.empty()) { 74 | return; 75 | } 76 | 77 | size_t bytes = fwrite(data_buffer.data(), 1, data_buffer.size(), fp); 78 | if (bytes != data_buffer.size()) { 79 | LOG_FATAL("fwrite error %s", strerror(errno)); 80 | } 81 | 82 | file_size += data_buffer.size(); 83 | data_buffer.clear(); 84 | } 85 | 86 | void read_all(std::vector& out) { 87 | char buffer[1024]; 88 | while (true) { 89 | size_t bytes = fread(buffer, 1, sizeof(buffer), fp); 90 | if (bytes <= 0) { 91 | break; 92 | } 93 | 94 | out.insert(out.end(), buffer, buffer + bytes); 95 | } 96 | 97 | fseek(fp, 0L, SEEK_END); 98 | } 99 | 100 | std::vector data_buffer; 101 | int64_t seq; 102 | long file_size; 103 | FILE* fp; 104 | }; 105 | 106 | void WAL::create(const std::string& dir) { 107 | using namespace boost; 108 | 109 | filesystem::path walFile = filesystem::path(dir) / wal_name(0, 0); 110 | std::string tmpPath = walFile.string() + ".tmp"; 111 | 112 | if (filesystem::exists(tmpPath)) { 113 | filesystem::remove(tmpPath); 114 | } 115 | 116 | { 117 | std::shared_ptr wal(new WAL_File(tmpPath.c_str(), 0)); 118 | WAL_Snapshot snap; 119 | snap.term = 0; 120 | snap.index = 0; 121 | msgpack::sbuffer sbuf; 122 | msgpack::pack(sbuf, snap); 123 | wal->append(wal_snapshot_Type, (uint8_t*) sbuf.data(), sbuf.size()); 124 | wal->sync(); 125 | } 126 | 127 | filesystem::rename(tmpPath, walFile); 128 | } 129 | 130 | WAL_ptr WAL::open(const std::string& dir, const WAL_Snapshot& snap) { 131 | 
WAL_ptr w(new WAL(dir)); 132 | 133 | std::vector names; 134 | w->get_wal_names(dir, names); 135 | if (names.empty()) { 136 | LOG_FATAL("wal not found"); 137 | } 138 | 139 | uint64_t nameIndex; 140 | if (!WAL::search_index(names, snap.index, &nameIndex)) { 141 | LOG_FATAL("wal not found"); 142 | } 143 | 144 | std::vector check_names(names.begin() + nameIndex, names.end()); 145 | if (!WAL::is_valid_seq(check_names)) { 146 | LOG_FATAL("invalid wal seq"); 147 | } 148 | 149 | for (const std::string& name: check_names) { 150 | uint64_t seq; 151 | uint64_t index; 152 | if (!parse_wal_name(name, &seq, &index)) { 153 | LOG_FATAL("invalid wal name %s", name.c_str()); 154 | } 155 | 156 | boost::filesystem::path path = boost::filesystem::path(w->dir_) / name; 157 | std::shared_ptr file(new WAL_File(path.string().c_str(), seq)); 158 | w->files_.push_back(file); 159 | } 160 | 161 | memcpy(&w->start_, &snap, sizeof(snap)); 162 | return w; 163 | } 164 | 165 | Status WAL::read_all(proto::HardState& hs, std::vector& ents) { 166 | std::vector data; 167 | for (auto file : files_) { 168 | data.clear(); 169 | file->read_all(data); 170 | size_t offset = 0; 171 | bool matchsnap = false; 172 | 173 | while (offset < data.size()) { 174 | size_t left = data.size() - offset; 175 | size_t record_begin_offset = offset; 176 | 177 | if (left < sizeof(WAL_Record)) { 178 | file->truncate(record_begin_offset); 179 | LOG_WARN("invalid record len %lu", left); 180 | break; 181 | } 182 | 183 | WAL_Record record; 184 | memcpy(&record, data.data() + offset, sizeof(record)); 185 | 186 | left -= sizeof(record); 187 | offset += sizeof(record); 188 | 189 | if (record.type == wal_InvalidType) { 190 | break; 191 | } 192 | 193 | uint32_t record_data_len = WAL_Record_len(record); 194 | if (left < record_data_len) { 195 | file->truncate(record_begin_offset); 196 | LOG_WARN("invalid record data len %lu, %u", left, record_data_len); 197 | break; 198 | } 199 | 200 | char* data_ptr = data.data() + offset; 201 | 
uint32_t crc = compute_crc32(data_ptr, record_data_len); 202 | 203 | left -= record_data_len; 204 | offset += record_data_len; 205 | 206 | if (record.crc != 0 && crc != record.crc) { 207 | file->truncate(record_begin_offset); 208 | LOG_WARN("invalid record crc %u, %u", record.crc, crc); 209 | break; 210 | } 211 | 212 | handle_record_wal_record(record.type, data_ptr, record_data_len, matchsnap, hs, ents); 213 | 214 | if (record.type == wal_snapshot_Type) { 215 | matchsnap = true; 216 | } 217 | } 218 | 219 | if (!matchsnap) { 220 | LOG_FATAL("wal: snapshot not found"); 221 | } 222 | } 223 | 224 | return Status::ok(); 225 | } 226 | 227 | void WAL::handle_record_wal_record(WAL_type type, 228 | const char* data, 229 | size_t data_len, 230 | bool& matchsnap, 231 | proto::HardState& hs, 232 | std::vector& ents) { 233 | 234 | switch (type) { 235 | case wal_EntryType: { 236 | proto::EntryPtr entry(new proto::Entry()); 237 | msgpack::object_handle oh = msgpack::unpack(data, data_len); 238 | oh.get().convert(*entry); 239 | 240 | if (entry->index > start_.index) { 241 | ents.resize(entry->index - start_.index - 1); 242 | ents.push_back(entry); 243 | } 244 | 245 | enti_ = entry->index; 246 | break; 247 | } 248 | 249 | case wal_StateType: { 250 | msgpack::object_handle oh = msgpack::unpack(data, data_len); 251 | oh.get().convert(hs); 252 | break; 253 | } 254 | 255 | case wal_snapshot_Type: { 256 | WAL_Snapshot snap; 257 | msgpack::object_handle oh = msgpack::unpack(data, data_len); 258 | oh.get().convert(snap); 259 | 260 | if (snap.index == start_.index) { 261 | if (snap.term != start_.term) { 262 | LOG_FATAL("wal: snapshot mismatch"); 263 | } 264 | matchsnap = true; 265 | } 266 | break; 267 | } 268 | 269 | case wal_CrcType: { 270 | LOG_FATAL("wal crc type"); 271 | break; 272 | } 273 | default: { 274 | LOG_FATAL("invalid record type %d", type); 275 | } 276 | } 277 | } 278 | 279 | Status WAL::save(proto::HardState hs, const std::vector& ents) { 280 | // short cut, do not call 
sync 281 | if (hs.is_empty_state() && ents.empty()) { 282 | return Status::ok(); 283 | } 284 | 285 | bool mustSync = is_must_sync(hs, state_, ents.size()); 286 | Status status; 287 | 288 | for (const proto::EntryPtr& entry: ents) { 289 | status = save_entry(*entry); 290 | if (!status.is_ok()) { 291 | return status; 292 | } 293 | } 294 | 295 | status = save_hard_state(hs); 296 | if (!status.is_ok()) { 297 | return status; 298 | } 299 | 300 | if (files_.back()->file_size < SegmentSizeBytes) { 301 | if (mustSync) { 302 | files_.back()->sync(); 303 | } 304 | return Status::ok(); 305 | } 306 | 307 | return cut(); 308 | } 309 | 310 | Status WAL::cut() { 311 | files_.back()->sync(); 312 | return Status::ok(); 313 | } 314 | 315 | Status WAL::save_snapshot(const WAL_Snapshot& snap) { 316 | msgpack::sbuffer sbuf; 317 | msgpack::pack(sbuf, snap); 318 | 319 | files_.back()->append(wal_snapshot_Type, (uint8_t*)sbuf.data(), sbuf.size()); 320 | if (enti_ < snap.index) { 321 | enti_ = snap.index; 322 | } 323 | files_.back()->sync(); 324 | return Status::ok(); 325 | } 326 | 327 | Status WAL::save_entry(const proto::Entry& entry) { 328 | msgpack::sbuffer sbuf; 329 | msgpack::pack(sbuf, entry); 330 | 331 | files_.back()->append(wal_EntryType, (uint8_t*)sbuf.data(), sbuf.size()); 332 | enti_ = entry.index; 333 | return Status::ok(); 334 | } 335 | 336 | Status WAL::save_hard_state(const proto::HardState& hs) { 337 | if (hs.is_empty_state()) { 338 | return Status::ok(); 339 | } 340 | state_ = hs; 341 | 342 | msgpack::sbuffer sbuf; 343 | msgpack::pack(sbuf, hs); 344 | files_.back()->append(wal_StateType, (uint8_t*)sbuf.data(), sbuf.size()); 345 | return Status::ok(); 346 | } 347 | 348 | void WAL::get_wal_names(const std::string& dir, std::vector& names) { 349 | using namespace boost; 350 | 351 | filesystem::directory_iterator end; 352 | for (boost::filesystem::directory_iterator it(dir); it != end; it++) { 353 | filesystem::path filename = (*it).path().filename(); 354 | filesystem::path 
extension = filename.extension(); 355 | if (extension != ".wal") { 356 | continue; 357 | } 358 | names.push_back(filename.string()); 359 | } 360 | std::sort(names.begin(), names.end(), std::less()); 361 | } 362 | 363 | Status WAL::release_to(uint64_t index) { 364 | return Status::ok(); 365 | } 366 | 367 | bool WAL::parse_wal_name(const std::string& name, uint64_t* seq, uint64_t* index) { 368 | *seq = 0; 369 | *index = 0; 370 | 371 | boost::filesystem::path path(name); 372 | if (path.extension() != ".wal") { 373 | return false; 374 | } 375 | 376 | std::string filename = name.substr(0, name.size() - 4); // trim ".wal" 377 | size_t pos = filename.find('-'); 378 | if (pos == std::string::npos) { 379 | return false; 380 | } 381 | 382 | try { 383 | { 384 | std::string str = filename.substr(0, pos); 385 | std::stringstream ss; 386 | ss << std::hex << str; 387 | ss >> *seq; 388 | } 389 | 390 | { 391 | if (pos == filename.size() - 1) { 392 | return false; 393 | } 394 | std::string str = filename.substr(pos + 1, filename.size() - pos - 1); 395 | std::stringstream ss; 396 | ss << std::hex << str; 397 | ss >> *index; 398 | } 399 | } catch (...) { 400 | return false; 401 | } 402 | return true; 403 | } 404 | 405 | bool WAL::is_valid_seq(const std::vector& names) { 406 | uint64_t lastSeq = 0; 407 | for (const std::string& name: names) { 408 | uint64_t curSeq; 409 | uint64_t i; 410 | if (!WAL::parse_wal_name(name, &curSeq, &i)) { 411 | LOG_FATAL("parse correct name should never fail %s", name.c_str()); 412 | } 413 | 414 | if (lastSeq != 0 && lastSeq != curSeq - 1) { 415 | return false; 416 | } 417 | lastSeq = curSeq; 418 | } 419 | return true; 420 | } 421 | 422 | // searchIndex returns the last array index of names whose raft index section is 423 | // equal to or smaller than the given index. 424 | // The given names MUST be sorted. 
425 | bool WAL::search_index(const std::vector& names, uint64_t index, uint64_t* name_index) { 426 | 427 | for (size_t i = names.size() - 1; i >= 0; --i) { 428 | const std::string& name = names[i]; 429 | uint64_t seq; 430 | uint64_t curIndex; 431 | if (!parse_wal_name(name, &seq, &curIndex)) { 432 | LOG_FATAL("invalid wal name %s", name.c_str()); 433 | } 434 | 435 | if (index >= curIndex) { 436 | *name_index = i; 437 | return true; 438 | } 439 | if (i == 0) { 440 | break; 441 | } 442 | } 443 | *name_index = -1; 444 | return false; 445 | } 446 | 447 | } 448 | --------------------------------------------------------------------------------