├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bootstap.sh ├── deps ├── googletest-release-1.8.0.tar.gz ├── libevent-2.1.12-stable.tar.gz └── protobuf-3.14.0.tar.gz ├── include └── libraft.h ├── libraft-tests.cmake ├── libraft.cmake ├── src ├── base │ ├── crc32c.cc │ ├── crc32c.h │ ├── file.h │ ├── file_system_adaptor.cc │ ├── file_system_adaptor.h │ ├── io_buffer.cc │ ├── io_buffer.h │ ├── io_error.h │ ├── logger.cc │ ├── logger.h │ ├── mutex.cc │ ├── mutex.h │ ├── util.cc │ └── util.h ├── core │ ├── fsm_caller.cc │ ├── fsm_caller.h │ ├── node.cc │ ├── node.h │ ├── progress.cc │ ├── progress.h │ ├── raft.cc │ ├── raft.h │ ├── read_only.cc │ └── read_only.h ├── io │ ├── buffer_io_reader.cc │ ├── buffer_io_reader.h │ ├── file_io.h │ ├── io.h │ └── memory_io.h ├── proto │ ├── raft.pb.cc │ ├── raft.pb.h │ ├── raft.proto │ ├── record.pb.cc │ ├── record.pb.h │ └── record.proto ├── storage │ ├── log.cc │ ├── log.h │ ├── memory_storage.cc │ ├── memory_storage.h │ ├── unstable_log.cc │ └── unstable_log.h └── wal │ ├── decoder.cc │ ├── decoder.h │ ├── encoder.cc │ ├── encoder.h │ ├── util.h │ ├── wal.cc │ └── wal.h └── test ├── .gitignore ├── crc32c_test.cc ├── io_buffer_test.cc ├── log_test.cc ├── main.cc ├── memory_storage_test.cc ├── node_test.cc ├── progress_test.cc ├── raft_flow_controller_test.cc ├── raft_paper_test.cc ├── raft_snap_test.cc ├── raft_test.cc ├── raft_test_util.cc ├── raft_test_util.h ├── record_test.cc └── unstable_log_test.cc /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | third_party 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6...3.5) 2 | # The "...3.5" policy range lets CMake >= 4.0, which rejects policy versions below 3.5, configure this project; CMake < 3.12 ignores the range and only sees 2.6. 3 | project(libraft C CXX) 4 | # Default to a Debug build when no build type was requested. 5 | if(NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE "Debug") 7 | endif() 8 | # Compiler flags applied to every target; they are joined into CMAKE_CXX_FLAGS below. 9 | set(CXX_FLAGS 10 | -g 11 | -O0 12 | # -DVALGRIND 13 | 
-DCHECK_PTHREAD_RETURN_VALUE 14 | -D_FILE_OFFSET_BITS=64 15 | -Wall 16 | -Wextra 17 | -Werror 18 | #-Wconversion 19 | -Wno-unused-parameter 20 | #-Wold-style-cast 21 | -Wno-implicit-fallthrough 22 | -Woverloaded-virtual 23 | -Wpointer-arith 24 | -Wshadow 25 | -Wwrite-strings 26 | -march=native 27 | # -MMD 28 | -std=c++11 29 | -rdynamic 30 | ) 31 | # MATCHES covers both "Clang" and "AppleClang", the ID Apple's compiler reports when policy CMP0025 is NEW. 32 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 33 | list(APPEND CXX_FLAGS "-Wno-null-dereference") 34 | list(APPEND CXX_FLAGS "-Wno-sign-conversion") 35 | list(APPEND CXX_FLAGS "-Wno-unused-local-typedef") 36 | list(APPEND CXX_FLAGS "-Wthread-safety") 37 | list(REMOVE_ITEM CXX_FLAGS "-rdynamic") 38 | endif() 39 | string(REPLACE ";" " " CMAKE_CXX_FLAGS "${CXX_FLAGS}") 40 | # Per-configuration flags, then the output directories for executables and libraries. 41 | set(CMAKE_CXX_FLAGS_DEBUG "-O0") 42 | set(CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG") 43 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) 44 | set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) 45 | # Headers are searched in the project root, src/, include/ and the third_party tree; libraries in third_party/lib. 46 | INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}) 47 | INCLUDE_DIRECTORIES(src) 48 | INCLUDE_DIRECTORIES(include) 49 | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/third_party/include) 50 | LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib) 51 | # Print the effective flags for the selected build type. 52 | string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE) 53 | message(STATUS "CXX_FLAGS = " ${CMAKE_CXX_FLAGS} " " ${CMAKE_CXX_FLAGS_${BUILD_TYPE}}) 54 | # The library and test targets are defined in the included files. 55 | include(libraft.cmake) 56 | include(libraft-tests.cmake) 57 | #include(liblibraft-examples.cmake) 58 | 59 | #install(TARGETS libraft DESTINATION lib) 60 | 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Ant Financial Services Group Co., Ltd. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # libraft 2 | ![License](https://img.shields.io/badge/license-Apache--2.0-green.svg) 3 | 4 | 5 | ## Overview 6 | libraft is a C++ Raft library based on the [Raft](https://raft.github.io/) consensus algorithm and inspired by [etcd/raft](https://github.com/etcd-io/etcd/tree/master/raft). 7 | 8 | ## Features 9 | - Leader election and priority-based semi-deterministic leader election 10 | - Cluster membership management, adding nodes, removing nodes, replacing nodes, etc. 11 | - Leader transfer mechanism for reboot, load-balancing scenarios, etc. 
12 | - Symmetric network partition tolerance 13 | - Asymmetric network partition tolerance 14 | - Fault tolerance: the failure of a minority of nodes doesn't affect the overall availability of the system 15 | - Manual cluster recovery after a majority failure 16 | - Linearizable read, ReadIndex/LeaseRead 17 | - Replication pipeline 18 | 19 | ## Dependencies 20 | - CMake: >=2.6 21 | - Protobuf: >=3.14.0 22 | - gtest: >=1.8.0 23 | 24 | ## Build 25 | 26 | ``` 27 | # first, build the dependencies (skip this step if they are already installed on the system) 28 | ./bootstap.sh 29 | 30 | # second, build libraft 31 | mkdir build 32 | cd build 33 | cmake .. 34 | make 35 | 36 | # run tests 37 | ./bin/libraft_test 38 | ``` 39 | 40 | ## Documents 41 | 42 | 43 | ## Contribution 44 | 45 | 46 | ## Acknowledgement 47 | libraft was ported from etcd's raft module [etcd/raft](https://github.com/etcd-io/etcd/tree/master/raft) with some optimizations and improvements. Thanks to the etcd team for opening up such a great Raft implementation. 48 | 49 | ## License 50 | libraft is licensed under the [Apache License 2.0](./LICENSE). -------------------------------------------------------------------------------- /bootstap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #!/bin/sh 4 | 5 | pwd=`pwd` 6 | third_party=${pwd}/third_party 7 | rm -fr ${third_party} 8 | mkdir -p ${third_party} 9 | 10 | protobuf=protobuf-3.14.0 11 | gtest=googletest-release-1.8.0 12 | libevent=libevent-2.1.12-stable 13 | 14 | mkdir -p ${third_party}/include 15 | mkdir -p ${third_party}/lib 16 | mkdir -p ${third_party}/bin 17 | 18 | cd $pwd/deps 19 | echo "compile ${gtest}..." 20 | rm -fr ${gtest} 21 | tar xvf ${gtest}.tar.gz 22 | cd ${gtest} 23 | cmake . 
24 | make -j8 25 | cp ./googlemock/libgmock* ${third_party}/lib/ 26 | cp ./googlemock/gtest/libgtest* ${third_party}/lib/ 27 | cp -r googlemock/include/gmock ${third_party}/include/ 28 | cp -r googletest/include/gtest ${third_party}/include/ 29 | cd ../ 30 | rm -fr ${gtest} 31 | echo "compile ${gtest} done" 32 | 33 | cd $pwd/deps 34 | echo "compile ${protobuf}..." 35 | rm -fr ${protobuf} 36 | tar xvf ${protobuf}.tar.gz 37 | cd ${protobuf} 38 | ./autogen.sh 39 | ./configure --prefix=${third_party} 40 | make -j6 41 | make install 42 | cd .. 43 | rm -fr ${protobuf} 44 | echo "compile ${protobuf} done" 45 | 46 | cd ${pwd} 47 | 48 | cd $pwd/deps 49 | echo "compile ${libevent}..." 50 | rm -fr ${libevent} 51 | tar xvf ${libevent}.tar.gz 52 | cd ${libevent} 53 | ./autogen.sh 54 | ./configure --prefix=${third_party} --disable-openssl 55 | make -j6 56 | make install 57 | cd .. 58 | rm -fr ${libevent} 59 | echo "compile ${libevent} done" 60 | 61 | cd ${pwd} 62 | -------------------------------------------------------------------------------- /deps/googletest-release-1.8.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichuang/libraft/58568917c24a2df245043db8f259035c18c0eaef/deps/googletest-release-1.8.0.tar.gz -------------------------------------------------------------------------------- /deps/libevent-2.1.12-stable.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichuang/libraft/58568917c24a2df245043db8f259035c18c0eaef/deps/libevent-2.1.12-stable.tar.gz -------------------------------------------------------------------------------- /deps/protobuf-3.14.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lichuang/libraft/58568917c24a2df245043db8f259035c18c0eaef/deps/protobuf-3.14.0.tar.gz 
-------------------------------------------------------------------------------- /include/libraft.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIB_RAFT_H__ 6 | #define __LIB_RAFT_H__ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "proto/raft.pb.h" 13 | 14 | using namespace std; 15 | using namespace raftpb; 16 | 17 | namespace libraft { 18 | 19 | const static uint64_t kEmptyPeerId = 0; 20 | const static uint64_t kNoLimit = ULONG_MAX; 21 | 22 | enum ErrorCode { 23 | OK = 0, 24 | 25 | // ErrCompacted is returned by Storage.Entries/Compact when a requested 26 | // index is unavailable because it predates the last snapshot. 27 | ErrCompacted = 1, 28 | 29 | // ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested 30 | // index is older than the existing snapshot. 31 | ErrSnapOutOfDate = 2, 32 | 33 | // ErrUnavailable is returned by Storage interface when the requested log entries 34 | // are unavailable. 35 | ErrUnavailable = 3, 36 | 37 | // ErrSnapshotTemporarilyUnavailable is returned by the Storage interface when the required 38 | // snapshot is temporarily unavailable. 39 | ErrSnapshotTemporarilyUnavailable = 4, 40 | 41 | // ErrSerializeFail is returned by the Node interface when the request data Serialize failed. 
42 | ErrSerializeFail = 5, 43 | 44 | // Number of error code 45 | NumErrorCode 46 | }; 47 | 48 | static const char* 49 | kErrString[NumErrorCode] = { 50 | "OK", 51 | "ErrCompacted", 52 | "ErrSnapOutOfDate", 53 | "ErrUnavailable", 54 | "ErrSnapshotTemporarilyUnavailable", 55 | "ErrSerializeFail", 56 | }; 57 | 58 | inline const char* 59 | GetErrorString(int err) { 60 | return kErrString[err]; 61 | } 62 | 63 | inline bool SUCCESS(int err) { return err == OK; } 64 | 65 | enum StateType { 66 | StateFollower = 0, 67 | StateCandidate = 1, 68 | StateLeader = 2, 69 | StatePreCandidate = 3, 70 | NumStateType 71 | }; 72 | 73 | struct SoftState { 74 | uint64_t leader; 75 | StateType state; 76 | 77 | SoftState() 78 | : leader(kEmptyPeerId) 79 | , state(StateFollower) {} 80 | 81 | inline SoftState& operator=(const SoftState& from) { 82 | leader = from.leader; 83 | state = from.state; 84 | return *this; 85 | } 86 | }; 87 | 88 | // ReadState provides state for read only query. 89 | // It's caller's responsibility to call ReadIndex first before getting 90 | // this state from ready, It's also caller's duty to differentiate if this 91 | // state is what it requests through RequestCtx, eg. given a unique id as 92 | // RequestCtx 93 | struct ReadState { 94 | uint64_t index; 95 | string requestCtx; 96 | ReadState(uint64_t i, const string &ctx) 97 | : index(i), 98 | requestCtx(ctx) {} 99 | }; 100 | 101 | typedef vector EntryVec; 102 | typedef vector MessageVec; 103 | 104 | // Storage is an interface that may be implemented by the application 105 | // to retrieve log entries from storage. 106 | // 107 | // If any Storage method returns an error, the raft instance will 108 | // become inoperable and refuse to participate in elections; the 109 | // application is responsible for cleanup and recovery in this case. 110 | class Storage { 111 | public: 112 | virtual ~Storage() {} 113 | 114 | // InitialState returns the saved HardState and ConfState information. 
115 | virtual int InitialState(HardState *, ConfState *) = 0; 116 | 117 | // FirstIndex returns the index of the first log entry that is 118 | // possibly available via Entries (older entries have been incorporated 119 | // into the latest Snapshot; if storage only contains the dummy entry the 120 | // first log entry is not available). 121 | virtual int FirstIndex(uint64_t *index) = 0; 122 | 123 | // LastIndex returns the index of the last entry in the log. 124 | virtual int LastIndex(uint64_t *index) = 0; 125 | 126 | // Term returns the term of entry i, which must be in the range 127 | // [FirstIndex()-1, LastIndex()]. The term of the entry before 128 | // FirstIndex is retained for matching purposes even though the 129 | // rest of that entry may not be available. 130 | virtual int Term(uint64_t i, uint64_t *term) = 0; 131 | 132 | // Entries returns a slice of log entries in the range [lo,hi). 133 | // MaxSize limits the total size of the log entries returned, but 134 | // Entries returns at least one entry if any. 135 | virtual int Entries(uint64_t lo, uint64_t hi, uint64_t maxSize, EntryVec *entries) = 0; 136 | 137 | // Snapshot returns the most recent snapshot. 138 | // If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable, 139 | // so raft state machine could know that Storage needs some time to prepare 140 | // snapshot and call Snapshot later. 141 | virtual int GetSnapshot(Snapshot **snapshot) = 0; 142 | 143 | //int SetHardState(const HardState& ); 144 | //virtual int Append(const EntryVec& entries) = 0; 145 | //virtual int CreateSnapshot(uint64_t i, ConfState *cs, const string& data, Snapshot *ss) = 0; 146 | }; 147 | 148 | // ReadOnlyOption specifies how the read only request is processed. 149 | enum ReadOnlyOption { 150 | // ReadOnlySafe guarantees the linearizability of the read only request by 151 | // communicating with the quorum. It is the default and suggested option. 
152 | ReadOnlySafe = 0, 153 | 154 | // ReadOnlyLeaseBased ensures linearizability of the read only request by 155 | // relying on the leader lease. It can be affected by clock drift. 156 | // If the clock drift is unbounded, leader might keep the lease longer than it 157 | // should (clock can move backward/pause without any bound). ReadIndex is not safe 158 | // in that case. 159 | ReadOnlyLeaseBased = 1 160 | }; 161 | 162 | // StateMachine is the sink of all the events of every raft node. 163 | // The application implements a concrete StateMachine. 164 | // 165 | // NOTE: All the interfaces are not guaranteed to be thread safe and they are 166 | // called sequentially, saying that every single operation will block all the 167 | // following ones. 168 | class StateMachine { 169 | public: 170 | virtual ~StateMachine(); 171 | // Receives a SoftState (leader id and node state); presumably invoked when it changes - confirm against the caller. 172 | virtual void on_soft_state_changed(const SoftState&) = 0; 173 | // Receives a MessageVec; presumably the outbound messages the application must deliver - confirm against the caller. 174 | virtual void on_send_message(const MessageVec&) = 0; 175 | }; 176 | // raft_log_func is the signature of a logging callback that is handed the log text in buf. 177 | typedef void (*raft_log_func)(const char * buf); 178 | // NOTE(review): Warn (1) is numbered below Info (2); confirm whether level filtering depends on this order. 179 | enum LogLevel { 180 | Debug = 0, 181 | Warn = 1, 182 | Info = 2, 183 | Error = 3, 184 | Fatal = 4, 185 | }; 186 | 187 | // Config contains the parameters to start a raft. 188 | struct Config { 189 | // ID is the identity of the local raft. ID cannot be 0. 190 | uint64_t id = kEmptyPeerId; 191 | 192 | // peers contains the IDs of all nodes (including self) in the raft cluster. It 193 | // should only be set when starting a new raft cluster. Restarting raft from 194 | // previous configuration will panic if peers is set. peer is private and only 195 | // used for testing right now. 196 | vector peers; 197 | 198 | // electionTick is the number of Node.Tick invocations that must pass between 199 | // elections. That is, if a follower does not receive any message from the 200 | // leader of current term before ElectionTick has elapsed, it will become 201 | // candidate and start an election. 
ElectionTick must be greater than 202 | // HeartbeatTick. We suggest ElectionTick = 10 * HeartbeatTick to avoid 203 | // unnecessary leader switching. 204 | int electionTick = 10; 205 | 206 | // heartbeatTick is the number of Node.Tick invocations that must pass between 207 | // heartbeats. That is, a leader sends heartbeat messages to maintain its 208 | // leadership every HeartbeatTick ticks. 209 | int heartbeatTick = 1; 210 | 211 | // storage is the storage for raft. raft generates entries and states to be 212 | // stored in storage. raft reads the persisted entries and states out of 213 | // Storage when it needs. raft reads out the previous state and configuration 214 | // out of storage when restarting. 215 | // when node end up, storage will be destroyed. 216 | // if it is NULL, use `MemoryStorage' by default. 217 | Storage* storage = NULL; 218 | 219 | // applied is the last applied index. It should only be set when restarting 220 | // raft. raft will not return entries to the application smaller or equal to 221 | // Applied. If Applied is unset when restarting, raft might return previous 222 | // applied entries. This is a very application dependent configuration. 223 | uint64_t applied = 0; 224 | 225 | // maxSizePerMsg limits the max size of each append message. Smaller value 226 | // lowers the raft recovery cost(initial probing and message lost during normal 227 | // operation). On the other side, it might affect the throughput during normal 228 | // replication. Note: math.MaxUint64 for unlimited, 0 for at most one entry per 229 | // message. 230 | uint64_t maxSizePerMsg = 1024 * 1024; 231 | 232 | // maxInflightMsgs limits the max number of in-flight append messages during 233 | // optimistic replication phase. The application transportation layer usually 234 | // has its own sending buffer over TCP/UDP. Setting MaxInflightMsgs to avoid 235 | // overflowing that sending buffer. TODO (xiangli): feedback to application to 236 | // limit the proposal rate? 
237 | uint64_t maxInflightMsgs = 1024; 238 | 239 | // checkQuorum specifies if the leader should check quorum activity. Leader 240 | // steps down when quorum is not active for an electionTimeout. 241 | bool checkQuorum = false; 242 | 243 | // preVote enables the Pre-Vote algorithm described in raft thesis section 244 | // 9.6. This prevents disruption when a node that has been partitioned away 245 | // rejoins the cluster. 246 | bool preVote = false; 247 | 248 | // log level of raft log, Debug by default 249 | LogLevel logLevel = Debug; 250 | 251 | // logFunc is the logger function used for raft log. For multinode which can host 252 | // multiple raft group, each raft group can have its own logger. 253 | // when node end up, storage will be destroyed. 254 | // if it is NULL, the default logger will send log to stdout. 255 | raft_log_func logFunc = NULL; 256 | 257 | // the state machine implemented by the application 258 | StateMachine* fsm = NULL; 259 | // readOnlyOption specifies how read only requests are processed; ReadOnlySafe by default, so a default-constructed Config never carries an indeterminate value. 260 | ReadOnlyOption readOnlyOption = ReadOnlySafe; 261 | }; 262 | 263 | struct Peer { 264 | uint64_t Id; 265 | string Context; 266 | }; 267 | 268 | class Node { 269 | public: 270 | // Tick increments the internal logical clock for the Node by a single tick. Election 271 | // timeouts and heartbeat timeouts are in units of ticks. 272 | virtual void Tick() = 0; 273 | 274 | // Campaign causes the Node to transition to candidate state and start campaigning to become leader. 275 | virtual int Campaign() = 0; 276 | 277 | // Propose proposes that data be appended to the log. 278 | virtual int Propose(const string& data) = 0; 279 | 280 | // ProposeConfChange proposes config change. 281 | // At most one ConfChange can be in the process of going through consensus. 282 | // Application needs to call ApplyConfChange when applying EntryConfChange type entry. 283 | virtual int ProposeConfChange(const ConfChange& cc) = 0; 284 | 285 | // Step advances the state machine using the given message. ctx.Err() will be returned, if any. 
286 | virtual int Step(const Message& msg) = 0; 287 | 288 | // Advance notifies the Node that the application has saved progress up to the last Ready. 289 | // It prepares the node to return the next available Ready. 290 | // 291 | // The application should generally call Advance after it applies the entries in last Ready. 292 | // 293 | // However, as an optimization, the application may call Advance while it is applying the 294 | // commands. For example. when the last Ready contains a snapshot, the application might take 295 | // a long time to apply the snapshot data. To continue receiving Ready without blocking raft 296 | // progress, it can call Advance before finishing applying the last ready. 297 | virtual void Advance() = 0; 298 | 299 | // ApplyConfChange applies config change to the local node. 300 | // Returns an opaque ConfState protobuf which must be recorded 301 | // in snapshots. Will never return nil; it returns a pointer only 302 | // to match MemoryStorage.Compact. 303 | virtual void ApplyConfChange(const ConfChange& cc, ConfState *cs) = 0; 304 | 305 | // TransferLeadership attempts to transfer leadership to the given transferee. 306 | virtual void TransferLeadership(uint64_t leader, uint64_t transferee) = 0; 307 | 308 | // ReadIndex request a read state. The read state will be set in the ready. 309 | // Read state has a read index. Once the application advances further than the read 310 | // index, any linearizable read requests issued before the read request can be 311 | // processed safely. The read state will have the same rctx attached. 312 | virtual int ReadIndex(const string &rctx) = 0; 313 | 314 | // Stop performs any necessary termination of the Node. 
315 | virtual void Stop() = 0; 316 | }; 317 | 318 | extern Node* StartNode(Config *config, const vector& peers); 319 | extern Node* RestartNode(Config *config); 320 | 321 | // empty (hard,soft) state constants 322 | static const HardState kEmptyHardState; 323 | static const SoftState kEmptySoftState; 324 | 325 | }; // namespace libraft 326 | 327 | #endif // __LIB_RAFT_H__ 328 | -------------------------------------------------------------------------------- /libraft-tests.cmake: -------------------------------------------------------------------------------- 1 | 2 | add_executable ( libraft_test 3 | test/crc32c_test.cc 4 | test/io_buffer_test.cc 5 | test/log_test.cc 6 | test/main.cc 7 | test/memory_storage_test.cc 8 | test/node_test.cc 9 | test/progress_test.cc 10 | #test/record_test.cc 11 | test/raft_flow_controller_test.cc 12 | test/raft_paper_test.cc 13 | test/raft_snap_test.cc 14 | test/raft_test_util.cc 15 | test/raft_test.cc 16 | test/unstable_log_test.cc 17 | ) 18 | 19 | target_link_libraries (libraft_test PRIVATE raft gtest pthread protobuf) -------------------------------------------------------------------------------- /libraft.cmake: -------------------------------------------------------------------------------- 1 | set(libraft_files 2 | src/proto/raft.pb.cc 3 | src/proto/record.pb.cc 4 | 5 | src/base/crc32c.cc 6 | src/base/file_system_adaptor.cc 7 | src/base/io_buffer.cc 8 | src/base/logger.cc 9 | src/base/mutex.cc 10 | src/base/util.cc 11 | 12 | src/core/node.cc 13 | src/core/fsm_caller.cc 14 | src/core/progress.cc 15 | src/core/raft.cc 16 | src/core/read_only.cc 17 | 18 | src/io/buffer_io_reader.cc 19 | 20 | src/storage/log.cc 21 | src/storage/memory_storage.cc 22 | src/storage/unstable_log.cc 23 | 24 | src/wal/decoder.cc 25 | src/wal/encoder.cc 26 | src/wal/wal.cc 27 | ) 28 | 29 | add_library(raft 30 | ${raft_SHARED_OR_STATIC} 31 | ${libraft_files} 32 | ) 33 | -------------------------------------------------------------------------------- 
/src/base/crc32c.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | // 5 | // A portable implementation of crc32c, optimized to handle 6 | // four bytes at a time. 7 | 8 | #include "base/crc32c.h" 9 | 10 | #include 11 | 12 | namespace libraft { 13 | 14 | static const uint32_t table0_[256] = { 15 | 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 16 | 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, 17 | 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 18 | 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 19 | 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 20 | 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, 21 | 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 22 | 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, 23 | 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 24 | 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 25 | 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 26 | 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, 27 | 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 28 | 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, 29 | 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 30 | 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 31 | 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 32 | 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, 33 | 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 34 | 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, 35 | 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 36 | 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 37 | 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 38 | 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, 39 | 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 40 | 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, 41 | 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 42 | 
0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 43 | 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 44 | 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, 45 | 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 46 | 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, 47 | 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 48 | 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 49 | 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 50 | 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, 51 | 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 52 | 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, 53 | 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 54 | 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 55 | 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 56 | 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, 57 | 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 58 | 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, 59 | 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 60 | 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 61 | 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 62 | 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, 63 | 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 64 | 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, 65 | 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 66 | 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 67 | 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 68 | 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, 69 | 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 70 | 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, 71 | 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 72 | 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 73 | 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 74 | 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, 75 | 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 76 | 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, 77 | 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 78 | 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 79 | }; 80 | static const uint32_t 
table1_[256] = { 81 | 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 82 | 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, 83 | 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 84 | 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 85 | 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 86 | 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, 87 | 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 88 | 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, 89 | 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 90 | 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 91 | 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 92 | 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, 93 | 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 94 | 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, 95 | 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 96 | 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 97 | 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 98 | 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, 99 | 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 100 | 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, 101 | 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 102 | 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 103 | 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 104 | 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, 105 | 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 106 | 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, 107 | 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 108 | 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 109 | 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 110 | 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, 111 | 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 112 | 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, 113 | 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 114 | 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 115 | 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 116 | 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, 117 | 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 118 
| 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, 119 | 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 120 | 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 121 | 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 122 | 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, 123 | 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 124 | 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, 125 | 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 126 | 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 127 | 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 128 | 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, 129 | 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 130 | 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, 131 | 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 132 | 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 133 | 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 134 | 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, 135 | 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 136 | 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, 137 | 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 138 | 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, 139 | 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 140 | 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, 141 | 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 142 | 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, 143 | 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 144 | 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 145 | }; 146 | static const uint32_t table2_[256] = { 147 | 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 148 | 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, 149 | 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 150 | 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 151 | 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 152 | 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, 153 | 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 154 | 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, 155 | 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 156 | 
0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 157 | 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 158 | 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, 159 | 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 160 | 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, 161 | 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 162 | 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 163 | 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 164 | 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, 165 | 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 166 | 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, 167 | 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 168 | 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 169 | 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 170 | 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, 171 | 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 172 | 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, 173 | 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 174 | 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 175 | 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 176 | 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, 177 | 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 178 | 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, 179 | 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 180 | 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 181 | 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 182 | 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, 183 | 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 184 | 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, 185 | 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 186 | 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 187 | 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 188 | 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, 189 | 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 190 | 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, 191 | 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 192 | 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 193 | 
0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 194 | 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, 195 | 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 196 | 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, 197 | 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 198 | 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 199 | 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 200 | 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, 201 | 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 202 | 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, 203 | 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 204 | 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 205 | 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 206 | 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, 207 | 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 208 | 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, 209 | 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 210 | 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 211 | }; 212 | static const uint32_t table3_[256] = { 213 | 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 214 | 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, 215 | 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 216 | 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 217 | 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 218 | 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, 219 | 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 220 | 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, 221 | 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 222 | 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 223 | 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 224 | 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, 225 | 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 226 | 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, 227 | 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 228 | 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, 229 | 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 230 | 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, 231 | 
0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 232 | 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, 233 | 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 234 | 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 235 | 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 236 | 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, 237 | 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 238 | 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, 239 | 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 240 | 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 241 | 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 242 | 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, 243 | 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 244 | 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, 245 | 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 246 | 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 247 | 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 248 | 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, 249 | 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 250 | 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, 251 | 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 252 | 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 253 | 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 254 | 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, 255 | 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 256 | 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, 257 | 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 258 | 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 259 | 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 260 | 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, 261 | 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 262 | 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, 263 | 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 264 | 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 265 | 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 266 | 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, 267 | 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 268 | 
0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, 269 | 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 270 | 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 271 | 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 272 | 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, 273 | 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 274 | 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, 275 | 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 276 | 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 277 | }; 278 | 279 | // Lower-level versions of Get... that read directly from a character buffer 280 | // without any bounds checking. 281 | 282 | static inline uint32_t DecodeFixed32(const char* ptr) { 283 | #if defined(ARCH_CPU_LITTLE_ENDIAN) && ARCH_CPU_LITTLE_ENDIAN 284 | // Load the raw bytes 285 | uint32_t result; 286 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 287 | return result; 288 | #else 289 | return ((static_cast(static_cast(ptr[0]))) 290 | | (static_cast(static_cast(ptr[1])) << 8) 291 | | (static_cast(static_cast(ptr[2])) << 16) 292 | | (static_cast(static_cast(ptr[3])) << 24)); 293 | #endif 294 | } 295 | 296 | inline uint64_t DecodeFixed64(const char* ptr) { 297 | #if defined(ARCH_CPU_LITTLE_ENDIAN) && ARCH_CPU_LITTLE_ENDIAN 298 | // Load the raw bytes 299 | uint64_t result; 300 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 301 | return result; 302 | #else 303 | uint64_t lo = DecodeFixed32(ptr); 304 | uint64_t hi = DecodeFixed32(ptr + 4); 305 | return (hi << 32) | lo; 306 | #endif 307 | } 308 | 309 | // Used to fetch a naturally-aligned 32-bit word in little endian byte-order 310 | static inline uint32_t LE_LOAD32(const uint8_t *p) { 311 | return DecodeFixed32(reinterpret_cast(p)); 312 | } 313 | 314 | uint32_t Extend(uint32_t crc, const char* buf, size_t size) { 315 | const uint8_t *p = reinterpret_cast(buf); 316 | const uint8_t *e = p + size; 317 | uint32_t l = crc ^ 0xffffffffu; 318 | 319 | #define STEP1 do { \ 
320 | int c = (l & 0xff) ^ *p++; \ 321 | l = table0_[c] ^ (l >> 8); \ 322 | } while (0) 323 | #define STEP4 do { \ 324 | uint32_t c = l ^ LE_LOAD32(p); \ 325 | p += 4; \ 326 | l = table3_[c & 0xff] ^ \ 327 | table2_[(c >> 8) & 0xff] ^ \ 328 | table1_[(c >> 16) & 0xff] ^ \ 329 | table0_[c >> 24]; \ 330 | } while (0) 331 | 332 | // Point x at first 4-byte aligned byte in string. This might be 333 | // just past the end of the string. 334 | const uintptr_t pval = reinterpret_cast(p); 335 | const uint8_t* x = reinterpret_cast(((pval + 3) >> 2) << 2); 336 | if (x <= e) { 337 | // Process bytes until finished or p is 4-byte aligned 338 | while (p != x) { 339 | STEP1; 340 | } 341 | } 342 | // Process bytes 16 at a time 343 | while ((e-p) >= 16) { 344 | STEP4; STEP4; STEP4; STEP4; 345 | } 346 | // Process bytes 4 at a time 347 | while ((e-p) >= 4) { 348 | STEP4; 349 | } 350 | // Process the last few bytes 351 | while (p != e) { 352 | STEP1; 353 | } 354 | #undef STEP4 355 | #undef STEP1 356 | return l ^ 0xffffffffu; 357 | } 358 | 359 | } // namespace libraft 360 | -------------------------------------------------------------------------------- /src/base/crc32c.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | #ifndef __LIBRAFT_BASE_CRC32C_H__ 6 | #define __LIBRAFT_BASE_CRC32C_H__ 7 | 8 | #include 9 | #include 10 | 11 | namespace libraft { 12 | 13 | // Return the crc32c of concat(A, data[0,n-1]) where init_crc is the 14 | // crc32c of some string A. Extend() is often used to maintain the 15 | // crc32c of a stream of data. 
16 | extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); 17 | 18 | // Return the crc32c of data[0,n-1] 19 | inline uint32_t Value(const char* data, size_t n) { 20 | return Extend(0, data, n); 21 | } 22 | 23 | static const uint32_t kMaskDelta = 0xa282ead8ul; 24 | 25 | // Return a masked representation of crc. 26 | // 27 | // Motivation: it is problematic to compute the CRC of a string that 28 | // contains embedded CRCs. Therefore we recommend that CRCs stored 29 | // somewhere (e.g., in files) should be masked before being stored. 30 | inline uint32_t Mask(uint32_t crc) { 31 | // Rotate right by 15 bits and add a constant. 32 | return ((crc >> 15) | (crc << 17)) + kMaskDelta; 33 | } 34 | 35 | // Return the crc whose masked representation is masked_crc. 36 | inline uint32_t Unmask(uint32_t masked_crc) { 37 | uint32_t rot = masked_crc - kMaskDelta; 38 | return ((rot >> 17) | (rot << 15)); 39 | } 40 | 41 | } // namespace libraft 42 | 43 | #endif // __LIBRAFT_BASE_CRC32C_H__ 44 | -------------------------------------------------------------------------------- /src/base/file.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_FILE_H__ 6 | #define __LIBRAFT_FILE_H__ 7 | 8 | #include 9 | #include "base/io_error.h" 10 | 11 | using namespace std; 12 | 13 | namespace libraft { 14 | 15 | class File { 16 | public: 17 | File(const string& path); 18 | 19 | int ReadInt64(int64_t* ret); 20 | int WriteUint64(uint64_t n); 21 | 22 | int ReadFull(char* data); 23 | int Write(const string& data); 24 | 25 | int Flush(); 26 | }; 27 | 28 | }; // namespace libraft 29 | 30 | #endif // __LIBRAFT_FILE_H__ -------------------------------------------------------------------------------- /src/base/file_system_adaptor.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | 
#include "base/file_system_adaptor.h"
// Required by the definitions below (access, stat/mkdir, errno).
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

namespace libraft {
// Returns true when `path` names an existing filesystem entry.
bool
PosixFileSystemAdaptor::PathExists(const string& path) {
  return access(path.c_str(), F_OK) == 0;
}

// Returns true when `dir` exists and is a directory.
bool
PosixFileSystemAdaptor::DirectoryExists(const string& dir) {
  struct stat buf;
  if (::stat(dir.c_str(), &buf) == 0) {
    return S_ISDIR(buf.st_mode);
  }
  return false;
}

// NOTE(review): this is still a stub - it reports success without removing
// anything. Left unchanged because turning a no-op into a real (possibly
// recursive) delete is a behaviour change that needs an owner decision.
bool
PosixFileSystemAdaptor::DeleteFile(const string& path, bool recursive) {
  return true;
}

// Creates the single directory `path` (mode 0755 before umask). Returns true
// when the directory was created or already exists as a directory.
// Previously declared but never defined, which made the class unlinkable.
bool
PosixFileSystemAdaptor::CreateDirectory(const string& path) {
  if (::mkdir(path.c_str(), 0755) == 0) {
    return true;
  }
  return errno == EEXIST && DirectoryExists(path);
}

// Joins `dir` and `name` with exactly one '/' between them; an empty `dir`
// yields `name` unchanged. Previously declared but never defined.
string
PosixFileSystemAdaptor::Join(const string& dir, const string& name) {
  if (dir.empty()) {
    return name;
  }
  if (dir[dir.size() - 1] == '/') {
    return dir + name;
  }
  return dir + "/" + name;
}

}; // namespace libraft
--------------------------------------------------------------------------------
/src/base/file_system_adaptor.h:
--------------------------------------------------------------------------------
/*
 * Copyright (C) lichuang
 */

#ifndef __LIBRAFT_FILE_SYSTEM_ADAPTOR_H__
#define __LIBRAFT_FILE_SYSTEM_ADAPTOR_H__

// NOTE(review): header name lost in extraction; <string> presumed - confirm.
#include <string>

using namespace std;

namespace libraft {

// Abstract filesystem operations, so callers do not depend on POSIX directly.
class FileSystemAdaptor {
public:
  // Defined inline: the constructor was declared but had no definition in
  // file_system_adaptor.cc, so no subclass could be linked.
  FileSystemAdaptor() {}
  virtual ~FileSystemAdaptor() {}

  // Determine whether a given path exists
  virtual bool PathExists(const string& path) = 0;

  // Determine whether a given directory exists
  virtual bool DirectoryExists(const string& dir) = 0;

  // Deletes the given path, whether it's a file or a directory.
  virtual bool DeleteFile(const string& path,bool recursive) = 0;

  // Creates the directory `path`.
  virtual bool CreateDirectory(const string& path) = 0;

  // Builds the path of `name` inside `dir`.
  virtual string Join(const string&dir, const string&name) = 0;
};

// FileSystemAdaptor backed by POSIX calls.
class PosixFileSystemAdaptor : public FileSystemAdaptor {
public:
  PosixFileSystemAdaptor() {}
  virtual ~PosixFileSystemAdaptor() {}

  virtual bool PathExists(const string& path);

  virtual bool DirectoryExists(const string& dir);

  virtual bool DeleteFile(const string& path, bool recursive);

  virtual bool CreateDirectory(const string& path);

  virtual string Join(const string&dir, const string&name);
};

}; // namespace libraft

#endif // __LIBRAFT_FILE_SYSTEM_ADAPTOR_H__
--------------------------------------------------------------------------------
/src/base/io_buffer.cc:
--------------------------------------------------------------------------------
/*
 * Copyright (C) lichuang
 */

// NOTE(review): header name lost in extraction; <string.h> presumed (memcpy).
#include <string.h>
#include "base/io_buffer.h"
#include "base/io_error.h"

namespace libraft {

static const uint32_t kBlockSize = 1024;

// One fixed-size segment of the buffer chain. Bytes in
// [read_pos, write_pos) are readable, [write_pos, kBlockSize) are writable.
struct Block {
  char buffer[kBlockSize];
  uint32_t read_pos;
  uint32_t write_pos;
  Block* next;

  uint32_t WriteSize() { return kBlockSize - write_pos; }

  uint32_t ReadSize() {
    if (write_pos <= read_pos) {
      return 0;
    }
    return write_pos - read_pos;
  }

  void AdvanceWrite(uint32_t wpos) { write_pos += wpos;}

  char* WritePos() { return buffer + write_pos; }

  void AdvanceRead(uint32_t rpos) { read_pos += rpos;}

  char* ReadPos() { return buffer + read_pos; }

  Block()
    : read_pos(0),
      write_pos(0),
      next(NULL) {
  }

  // Frees every following block. Done iteratively: the previous recursive
  // `delete next` used one stack frame per block in the chain.
  ~Block() {
    Block* cur = next;
    next = NULL;
    while (cur != NULL) {
      Block* following = cur->next;
      cur->next = NULL;
      delete cur;
      cur = following;
    }
  }
};

IOBuffer::IOBuffer()
  : head_(new Block()),
    read_block_(head_),
    write_block_(head_) {

}

IOBuffer::~IOBuffer() {
  delete head_;
}

// Guarantees that at least `size` writable bytes exist in the chain starting
// at write_block_, appending new blocks at the tail when necessary.
//
// Fixes: the old loop did `size -= kBlockSize` on an unsigned value, which
// wraps for any size that is not a multiple of kBlockSize and allocates
// forever; it also overwrote (leaked) an already existing `next` link.
void
IOBuffer::ensureMemory(uint32_t size) {
  uint64_t available = 0;
  Block* tail = write_block_;
  for (Block* b = write_block_; b != NULL; b = b->next) {
    available += b->WriteSize();
    tail = b;
  }

  while (available < size) {
    tail->next = new Block();
    tail = tail->next;
    available += kBlockSize;
  }
}

// Copies up to `bufsize` buffered bytes into `data` and returns the number
// copied. *err is kOK when `bufsize` bytes were delivered, kEOF when nothing
// was available, kErrUnexpectedEOF on a short read.
//
// Fixes: the old code copied whole blocks regardless of `bufsize`
// (overflowing `data`) and walked read_block_ to NULL at the end of the chain.
int
IOBuffer::ReadFull(char* data, uint32_t bufsize, int* err) {
  *err = kOK;
  uint32_t total = 0;

  while (total < bufsize && read_block_ != NULL) {
    uint32_t avail = read_block_->ReadSize();
    if (avail == 0) {
      // Move on only when this block is completely written and a successor
      // exists; otherwise later writes will land here and must stay readable.
      if (read_block_->WriteSize() == 0 && read_block_->next != NULL) {
        read_block_ = read_block_->next;
        continue;
      }
      break;
    }

    uint32_t want = bufsize - total;
    uint32_t chunk = avail < want ? avail : want;
    memcpy(data + total, read_block_->ReadPos(), chunk);
    read_block_->AdvanceRead(chunk);
    total += chunk;
  }

  if (total < bufsize) {
    *err = (total == 0) ? kEOF : kErrUnexpectedEOF;
  }

  return total;
}

// In-memory IOBuffer implementation.
class MemoryBuffer : public IOBuffer {
public:
  MemoryBuffer();
  MemoryBuffer(const string& data);
  virtual ~MemoryBuffer();

  int ReadInt64(int64_t* ret);
  int WriteUint64(uint64_t n);

  void Append(const string& data);
private:

};

MemoryBuffer::MemoryBuffer()
  : IOBuffer() {
}

MemoryBuffer::MemoryBuffer(const string& data)
  : IOBuffer() {
  Append(data);
}

MemoryBuffer::~MemoryBuffer() {

}

// Appends all bytes of `data`, spilling into following blocks as needed.
//
// Fixes: the source offset was never advanced (every chunk re-copied the
// start of `data`) and a full last block led to a NULL write_block_.
void
MemoryBuffer::Append(const string& data) {
  uint32_t remaining = static_cast<uint32_t>(data.size());
  if (remaining == 0) {
    return;
  }
  ensureMemory(remaining);

  const char* src = data.data();
  while (remaining > 0) {
    if (write_block_->WriteSize() == 0) {
      // ensureMemory() guarantees a successor exists here.
      write_block_ = write_block_->next;
    }
    uint32_t room = write_block_->WriteSize();
    uint32_t chunk = room < remaining ? room : remaining;
    memcpy(write_block_->WritePos(), src, chunk);
    write_block_->AdvanceWrite(chunk);
    src += chunk;
    remaining -= chunk;
  }
}

// Reads one int64 in host byte order. Returns 0 on success, otherwise the
// kEOF / kErrUnexpectedEOF code from ReadFull (bytes consumed by a short
// read are not restored). Previously read blindly from the current block,
// even across its end or when no data was buffered.
int
MemoryBuffer::ReadInt64(int64_t* ret) {
  char raw[sizeof(int64_t)];
  int err = kOK;
  ReadFull(raw, sizeof(raw), &err);
  if (err != kOK) {
    return err;
  }
  memcpy(ret, raw, sizeof(raw));
  return 0;
}

// Writes `n` in host byte order. Routed through Append() so the value may
// straddle two blocks; the old code could write past the block's buffer.
int
MemoryBuffer::WriteUint64(uint64_t n) {
  Append(string(reinterpret_cast<const char*>(&n), sizeof(n)));
  return 0;
}

IOBuffer*
newMemoryBuffer() {
  return new MemoryBuffer();
}

IOBuffer*
newMemoryBufferWithString(const string& data) {
  return new MemoryBuffer(data);
}

}; // namespace libraft
--------------------------------------------------------------------------------
/src/base/io_buffer.h:
--------------------------------------------------------------------------------
/*
 * Copyright (C) lichuang
 */

#ifndef __LIBRAFT_IO_BUFFER_H__
#define __LIBRAFT_IO_BUFFER_H__

// NOTE(review): header name lost in extraction; <string> presumed.
// <stdint.h> added for the fixed-width integer types used below.
#include <stdint.h>
#include <string>

using namespace std;

namespace libraft {

struct Block;

// Byte buffer backed by a chain of Blocks with independent read and write
// cursors.
class IOBuffer {
protected:
  IOBuffer();

  // Makes sure `size` more bytes can be written without further allocation.
  void ensureMemory(uint32_t size);
public:
  virtual int ReadInt64(int64_t* ret) = 0;
  virtual int WriteUint64(uint64_t n) = 0;

  virtual void Append(const string& data) = 0;

  virtual ~IOBuffer();

  // Reads up to `size` bytes into `data`; see io_error.h for *err values.
  int ReadFull(char* data, uint32_t size, int* err);
  /*

  int Write(const string& data);

  int Flush();
  */

protected:
  Block *head_;
  Block* read_block_;
  Block* write_block_;
};

extern IOBuffer* newMemoryBuffer();
extern IOBuffer* newMemoryBufferWithString(const string& data);

}; // namespace libraft

#endif //
__LIBRAFT_IO_BUFFER_H__ -------------------------------------------------------------------------------- /src/base/io_error.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_IO_ERROR_H__ 6 | #define __LIBRAFT_IO_ERROR_H__ 7 | 8 | namespace libraft { 9 | 10 | enum FileErrorCode { 11 | kOK = 0, 12 | kEOF = -1, 13 | kErrUnexpectedEOF = -2, 14 | }; 15 | 16 | }; // namespace libraft 17 | 18 | #endif // __LIBRAFT_IO_ERROR_H__ -------------------------------------------------------------------------------- /src/base/logger.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include // for basename 7 | #include 8 | 9 | namespace libraft { 10 | 11 | LogLevel gLogLevel = Debug; 12 | 13 | static void default_logger(const char * buf); 14 | const static int kLogBufferSize = 1024; 15 | static raft_log_func gLogFunc = default_logger; 16 | 17 | const static char* kLogString[] = { 18 | "D", 19 | "W", 20 | "I", 21 | "E", 22 | "F", 23 | }; 24 | 25 | static void 26 | default_logger(const char * buf) { 27 | fprintf(stdout, "%s", buf); 28 | //fflush(stdout); 29 | } 30 | 31 | void 32 | do_log(LogLevel level, const char *file, int line, const char *fmt, ...) 
{ 33 | va_list args; 34 | va_start(args, fmt); 35 | 36 | int n; 37 | char buf[kLogBufferSize] = {'\0'}; 38 | 39 | n = snprintf(buf, kLogBufferSize, "[%s:%d %s]", basename((char*)file), line, kLogString[level]); 40 | n += vsnprintf(buf + n, kLogBufferSize - n, fmt, args); 41 | va_end(args); 42 | 43 | if (n + 2 > kLogBufferSize) { 44 | return; 45 | } 46 | 47 | buf[n++] += '\n'; 48 | buf[n++] += '\0'; 49 | 50 | gLogFunc(buf); 51 | } 52 | 53 | }; // namespace libraft 54 | -------------------------------------------------------------------------------- /src/base/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_LOGGER_H__ 6 | #define __LIBRAFT_LOGGER_H__ 7 | 8 | #include 9 | #include 10 | #include "libraft.h" 11 | 12 | namespace libraft { 13 | 14 | void do_log(LogLevel level, const char *file, int line, const char *fmt, ...); 15 | 16 | extern LogLevel gLogLevel; 17 | 18 | #define Debugf(fmt, ...) if (gLogLevel <= Debug) do_log(Debug, __FILE__, __LINE__, fmt, ##__VA_ARGS__) 19 | #define Warnf(fmt, ...) if (gLogLevel <= Warn) do_log(Warn, __FILE__, __LINE__, fmt, ##__VA_ARGS__) 20 | #define Infof(fmt, ...) if (gLogLevel <= Info) do_log(Info, __FILE__, __LINE__, fmt, ##__VA_ARGS__) 21 | #define Errorf(fmt, ...) if (gLogLevel <= Error) do_log(Error, __FILE__, __LINE__, fmt, ##__VA_ARGS__) 22 | #define Fatalf(fmt, ...) 
do_log(Fatal, __FILE__, __LINE__, fmt, ##__VA_ARGS__);abort() 23 | 24 | extern void initLog(); 25 | 26 | }; // namespace libraft 27 | 28 | #endif // __LIBRAFT_LOGGER_H__ 29 | -------------------------------------------------------------------------------- /src/base/mutex.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/mutex.h" 6 | 7 | namespace libraft { 8 | struct LockerImpl { 9 | LockerImpl() { 10 | } 11 | ~LockerImpl() { 12 | } 13 | int Lock() { 14 | return 0; 15 | } 16 | int UnLock() { 17 | return 0; 18 | } 19 | }; 20 | 21 | Locker::Locker() : impl_(new LockerImpl()) { 22 | } 23 | 24 | Locker::~Locker() { 25 | delete impl_; 26 | } 27 | 28 | int Locker::Lock() { 29 | return impl_->Lock(); 30 | } 31 | 32 | int Locker::UnLock() { 33 | return impl_->UnLock(); 34 | } 35 | }; // namespace libraft -------------------------------------------------------------------------------- /src/base/mutex.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_MUTEX_H__ 6 | #define __LIBRAFT_MUTEX_H__ 7 | 8 | namespace libraft { 9 | struct LockerImpl; 10 | 11 | class Locker { 12 | public: 13 | Locker(); 14 | ~Locker(); 15 | 16 | int Lock(); 17 | int UnLock(); 18 | private: 19 | LockerImpl *impl_; 20 | }; 21 | 22 | class Mutex { 23 | public: 24 | Mutex(Locker *locker) : locker_(locker) { 25 | locker_->Lock(); 26 | } 27 | ~Mutex() { 28 | locker_->UnLock(); 29 | } 30 | private: 31 | Locker *locker_; 32 | }; 33 | 34 | }; // namespace libraft 35 | 36 | #endif // __LIBRAFT_MUTEX_H__ 37 | -------------------------------------------------------------------------------- /src/base/util.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "base/util.h" 7 | 8 | using namespace std; 9 | 
namespace libraft { 10 | 11 | bool 12 | isDeepEqualSnapshot(const Snapshot *s1, const Snapshot *s2) { 13 | if (s1 == NULL || s2 == NULL) { 14 | return false; 15 | } 16 | 17 | if (s1->metadata().index() != s2->metadata().index()) { 18 | return false; 19 | } 20 | if (s1->metadata().term() != s2->metadata().term()) { 21 | return false; 22 | } 23 | if (s1->data() != s2->data()) { 24 | return false; 25 | } 26 | 27 | return true; 28 | } 29 | 30 | bool 31 | isDeepEqualEntry(const Entry& ent1, const Entry& ent2) { 32 | if (ent1.type() != ent2.type()) { 33 | return false; 34 | } 35 | if (ent1.term() != ent2.term()) { 36 | return false; 37 | } 38 | if (ent1.index() != ent2.index()) { 39 | return false; 40 | } 41 | if (ent1.data() != ent2.data()) { 42 | return false; 43 | } 44 | return true; 45 | } 46 | 47 | bool 48 | isDeepEqualEntries(const EntryVec& ents1, const EntryVec& ents2) { 49 | if (ents1.size() != ents2.size()) { 50 | return false; 51 | } 52 | size_t i; 53 | for (i = 0; i < ents1.size(); ++i) { 54 | if (!isDeepEqualEntry(ents1[i], ents2[i])) { 55 | return false; 56 | } 57 | } 58 | return true; 59 | } 60 | 61 | bool isDeepEqualNodes(const vector& ns1, const vector& ns2) { 62 | if (ns1.size() != ns2.size()) { 63 | return false; 64 | } 65 | size_t i; 66 | for (i = 0; i < ns1.size(); ++i) { 67 | if (ns1[i] != ns2[i]) { 68 | return false; 69 | } 70 | } 71 | return true; 72 | } 73 | 74 | void 75 | limitSize(uint64_t maxSize, EntryVec *entries) { 76 | if (entries->empty()) { 77 | return; 78 | } 79 | 80 | int limit; 81 | int num = entries->size(); 82 | uint64_t size = (*entries)[0].ByteSizeLong(); 83 | for (limit = 1; limit < num; ++limit) { 84 | size += (*entries)[limit].ByteSizeLong(); 85 | if (size > maxSize) { 86 | break; 87 | } 88 | } 89 | 90 | entries->erase(entries->begin() + limit, entries->end()); 91 | } 92 | 93 | bool 94 | isLoaclMessage(const MessageType type) { 95 | return (type == MsgHup || 96 | type == MsgBeat || 97 | type == MsgUnreachable || 98 | type == 
MsgSnapStatus || 99 | type == MsgCheckQuorum); 100 | } 101 | 102 | bool 103 | isResponseMessage(const MessageType type) { 104 | return (type == MsgAppResp || 105 | type == MsgVoteResp || 106 | type == MsgHeartbeatResp || 107 | type == MsgUnreachable || 108 | type == MsgPreVoteResp); 109 | } 110 | 111 | bool 112 | isHardStateEqual(const HardState& h1, const HardState& h2) { 113 | return h1.term() == h2.term() && 114 | h1.vote() == h2.vote() && 115 | h1.commit() == h2.commit(); 116 | } 117 | 118 | bool 119 | isSoftStateEqual(const SoftState& s1, const SoftState& s2) { 120 | if (s1.leader != s2.leader) { 121 | return false; 122 | } 123 | 124 | return s1.state == s2.state; 125 | } 126 | 127 | bool 128 | isEmptySnapshot(const Snapshot* snapshot) { 129 | if (snapshot == NULL) { 130 | return true; 131 | } 132 | return snapshot->metadata().index() == 0; 133 | } 134 | 135 | bool 136 | isDeepEqualReadStates(const vector& rs1, const vector& rs2) { 137 | if (rs1.size() != rs2.size()) { 138 | return false; 139 | } 140 | size_t i; 141 | for (i = 0; i < rs1.size(); ++i) { 142 | ReadState* r1 = rs1[i]; 143 | ReadState* r2 = rs2[i]; 144 | if (r1->index != r2->index) { 145 | return false; 146 | } 147 | if (r1->requestCtx != r2->requestCtx) { 148 | return false; 149 | } 150 | } 151 | 152 | return true; 153 | } 154 | 155 | bool 156 | isDeepEqualMessage(const Message& msg1, const Message& msg2) { 157 | if (msg1.from() != msg2.from()) { 158 | return false; 159 | } 160 | if (msg1.to() != msg2.to()) { 161 | return false; 162 | } 163 | if (msg1.type() != msg2.type()) { 164 | return false; 165 | } 166 | 167 | if (msg1.entries_size() != msg2.entries_size()) { 168 | return false; 169 | } 170 | 171 | int i; 172 | for (i = 0; i < msg1.entries_size(); ++i) { 173 | if (!isDeepEqualEntry(msg1.entries(i), msg2.entries(i))) { 174 | return false; 175 | } 176 | } 177 | return true; 178 | } 179 | 180 | bool 181 | isDeepEqualRecord(const Record& r1, const Record& r2) { 182 | if (r1.type() != r2.type()) 
// Concatenates the elements of strs, inserting sep between every pair of
// adjacent elements. Returns an empty string for an empty vector.
std::string
joinStrings(const std::vector<std::string>& strs, const std::string &sep) {
  std::string ret;
  for (size_t i = 0; i < strs.size(); ++i) {
    // Decide on the element position, not on the accumulated length:
    // checking `ret.length() > 0` dropped the separator after leading empty
    // elements, so {"", "a"} joined to "a" instead of ",a".
    if (i > 0) {
      ret += sep;
    }
    ret += strs[i];
  }

  return ret;
}
Snapshot *s1, const Snapshot *s2); 19 | bool isDeepEqualEntries(const EntryVec& ents1, const EntryVec& ents2); 20 | bool isDeepEqualReadStates(const vector& rs1, const vector& rs2); 21 | bool isDeepEqualMessage(const Message& msg1, const Message& msg2); 22 | bool isDeepEqualRecord(const Record& r1, const Record& r2); 23 | bool isHardStateEqual(const HardState& h1, const HardState& h2); 24 | bool isSoftStateEqual(const SoftState& s1, const SoftState& s2); 25 | bool isEmptySnapshot(const Snapshot* snapshot); 26 | int numOfPendingConf(const EntryVec& entries); 27 | MessageType voteRespMsgType(int t); 28 | 29 | bool isLoaclMessage(const MessageType type); 30 | bool isResponseMessage(const MessageType type); 31 | 32 | string entryStr(const Entry& entry); 33 | string entryVecDebugString(const EntryVec& entries); 34 | 35 | // string util 36 | string joinStrings(const vector& strs, const string &sep); 37 | 38 | // IsEmptySnap returns true if the given Snapshot is empty. 39 | inline static bool 40 | isEmptyHardState(const HardState& hs) { 41 | return isHardStateEqual(hs, kEmptyHardState); 42 | } 43 | 44 | inline static bool 45 | isEmptySoftState(const SoftState& ss) { 46 | return isSoftStateEqual(ss, kEmptySoftState); 47 | } 48 | }; // namespace libraft 49 | 50 | #endif // __LIBRAFT_UTIL_H__ 51 | -------------------------------------------------------------------------------- /src/core/fsm_caller.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "libraft.h" 6 | #include "base/util.h" 7 | #include "core/fsm_caller.h" 8 | #include "core/node.h" 9 | 10 | namespace libraft { 11 | 12 | FsmCaller::FsmCaller(StateMachine* fsm, Node* node) 13 | : fsm_(fsm), 14 | node_(node) { 15 | 16 | } 17 | 18 | void 19 | FsmCaller::on_call_fsm(Ready* ready) { 20 | if (fsm_ == NULL) { // fsm == NULL means in test mode 21 | (void)node_; 22 | //node_->Advance(); 23 | return; 24 | } 25 | if 
(!isEmptySoftState(ready->softState)) { 26 | fsm_->on_soft_state_changed(ready->softState); 27 | } 28 | } 29 | 30 | }; // namespace libraft -------------------------------------------------------------------------------- /src/core/fsm_caller.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_FSM_CALLER_H__ 6 | #define __LIBRAFT_FSM_CALLER_H__ 7 | 8 | namespace libraft { 9 | 10 | class Node; 11 | class StateMachine; 12 | struct Ready; 13 | 14 | class FsmCaller { 15 | public: 16 | FsmCaller(StateMachine*, Node*); 17 | 18 | void on_call_fsm(Ready* ready); 19 | 20 | private: 21 | StateMachine* fsm_; 22 | Node* node_; 23 | }; 24 | 25 | }; // namespace libraft 26 | 27 | #endif // __LIBRAFT_FSM_CALLER_H__ 28 | -------------------------------------------------------------------------------- /src/core/node.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "base/logger.h" 7 | #include "base/util.h" 8 | #include "core/node.h" 9 | #include "core/raft.h" 10 | 11 | namespace libraft { 12 | 13 | NodeImpl::NodeImpl(raft* r, Config* config) 14 | : Node() 15 | , stopped_(false) 16 | , raft_(r) 17 | , leader_(kEmptyPeerId) 18 | , prevSoftState_(kEmptySoftState) 19 | , prevHardState_(kEmptyHardState) 20 | , waitAdvanced_(false) 21 | , fsm_caller_(config->fsm, this) 22 | , canPropose_(true) 23 | , msgType_(NoneMessage) 24 | , prevLastUnstableIndex_(0) 25 | , prevLastUnstableTerm_(0) 26 | , havePrevLastUnstableIndex_(false) 27 | , prevSnapshotIndex_(0) 28 | , confState_(NULL) { 29 | // init prev softState 30 | r->softState(&prevSoftState_); 31 | } 32 | 33 | NodeImpl::~NodeImpl() { 34 | delete raft_; 35 | } 36 | 37 | void 38 | NodeImpl::Tick() { 39 | msgType_ = TickMessage; 40 | stateMachine(Message()); 41 | } 42 | 43 | int 44 | NodeImpl::Campaign() { 45 | Message msg; 46 | 
msg.set_type(MsgHup); 47 | return doStep(msg); 48 | } 49 | 50 | int 51 | NodeImpl::Propose(const string& data) { 52 | Message msg; 53 | msg.set_type(MsgProp); 54 | msg.set_from(raft_->id_); 55 | msg.add_entries()->set_data(data); 56 | return doStep(msg); 57 | } 58 | 59 | int 60 | NodeImpl::ProposeConfChange(const ConfChange& cc) { 61 | string data; 62 | if (!cc.SerializeToString(&data)) { 63 | Errorf("ConfChange SerializeToString error"); 64 | return ErrSerializeFail; 65 | } 66 | 67 | Message msg; 68 | msg.set_type(MsgProp); 69 | msg.set_from(raft_->id_); 70 | Entry *entry = msg.add_entries(); 71 | entry->set_type(EntryConfChange); 72 | entry->set_data(data); 73 | 74 | return Step(msg); 75 | } 76 | 77 | int 78 | NodeImpl::Step(const Message& msg) { 79 | // ignore unexpected local messages receiving over network 80 | if (isLoaclMessage(msg.type())) { 81 | return OK; 82 | } 83 | 84 | return doStep(msg); 85 | } 86 | 87 | void 88 | NodeImpl::Advance() { 89 | if (prevHardState_.commit() != 0) { 90 | raft_->raftLog_->appliedTo(prevHardState_.commit()); 91 | } 92 | if (havePrevLastUnstableIndex_) { 93 | raft_->raftLog_->stableTo(prevLastUnstableIndex_, prevLastUnstableTerm_); 94 | havePrevLastUnstableIndex_ = false; 95 | } 96 | raft_->raftLog_->stableSnapTo(prevSnapshotIndex_); 97 | size_t i; 98 | for (i = 0; i < ready_.messages.size(); ++i) { 99 | delete ready_.messages[i]; 100 | } 101 | ready_.messages.clear(); 102 | for (i = 0; i < ready_.readStates.size(); ++i) { 103 | delete ready_.readStates[i]; 104 | } 105 | ready_.readStates.clear(); 106 | waitAdvanced_ = false; 107 | } 108 | 109 | void 110 | NodeImpl::ApplyConfChange(const ConfChange& cc, ConfState *cs) { 111 | confChange_ = cc; 112 | confState_ = cs; 113 | /* 114 | msgType_ = ConfChangeMessage; 115 | stateMachine(Message(), ready); 116 | */ 117 | handleConfChange(); 118 | } 119 | 120 | int 121 | NodeImpl::doStep(const Message& msg) { 122 | if (msg.type() == MsgProp) { 123 | msgType_ = ProposeMessage; 124 | } 
else { 125 | msgType_ = RecvMessage; 126 | } 127 | 128 | return stateMachine(msg); 129 | } 130 | 131 | void 132 | NodeImpl::TransferLeadership(uint64_t leader, uint64_t transferee) { 133 | msgType_ = RecvMessage; 134 | Message msg; 135 | msg.set_type(MsgTransferLeader); 136 | msg.set_from(transferee); 137 | msg.set_to(leader); 138 | 139 | stateMachine(msg); 140 | } 141 | 142 | int NodeImpl::ReadIndex(const string &rctx) { 143 | Message msg; 144 | msg.set_type(MsgReadIndex); 145 | msg.add_entries()->set_data(rctx); 146 | 147 | return doStep(msg); 148 | } 149 | 150 | int 151 | NodeImpl::stateMachine(const Message& msg) { 152 | if (stopped_) { 153 | return OK; 154 | } 155 | if (leader_ != raft_->leader_) { 156 | if (raft_->hasLeader()) { 157 | if (leader_ == kEmptyPeerId) { 158 | Infof("raft.node: %x elected leader %x at term %llu", 159 | raft_->id_, raft_->leader_, raft_->term_); 160 | } else { 161 | Infof("raft.node: %x changed leader from %x to %x at term %llu", 162 | raft_->id_, leader_, raft_->leader_, raft_->term_); 163 | } 164 | canPropose_ = true; 165 | } else { 166 | canPropose_ = false; 167 | Infof("raft.node: %x lost leader %x at term %llu", 168 | raft_->id_, leader_, raft_->term_); 169 | } 170 | leader_ = raft_->leader_; 171 | } 172 | 173 | int ret = OK; 174 | switch (msgType_) { 175 | case ProposeMessage: 176 | if (canPropose_) { 177 | raft_->step(msg); 178 | } 179 | break; 180 | case RecvMessage: 181 | // filter out response message from unknown From. 
182 | if (isMessageFromClusterNode(msg) || !isResponseMessage(msg.type())) { 183 | raft_->step(msg); 184 | } 185 | break; 186 | case TickMessage: 187 | raft_->tick(); 188 | break; 189 | case ConfChangeMessage: 190 | handleConfChange(); 191 | break; 192 | default: 193 | break; 194 | } 195 | 196 | if (!waitAdvanced_) { 197 | newReady(); 198 | if (readyContainUpdate()) { 199 | waitAdvanced_ = true; 200 | fsm_caller_.on_call_fsm(&ready_); 201 | } 202 | } 203 | 204 | reset(); 205 | return ret; 206 | } 207 | 208 | void 209 | NodeImpl::handleConfChange() { 210 | if (confChange_.nodeid() == kEmptyPeerId) { 211 | raft_->resetPendingConf(); 212 | goto addnodes; 213 | } 214 | 215 | switch(confChange_.type()) { 216 | case ConfChangeAddNode: 217 | raft_->addNode(confChange_.nodeid()); 218 | break; 219 | case ConfChangeRemoveNode: 220 | // block incoming proposal when local node is removed 221 | if (confChange_.nodeid() == raft_->id_) { 222 | canPropose_ = false; 223 | } 224 | raft_->removeNode(confChange_.nodeid()); 225 | break; 226 | case ConfChangeUpdateNode: 227 | raft_->resetPendingConf(); 228 | break; 229 | default: 230 | Fatalf("unexpected conf type"); 231 | break; 232 | } 233 | 234 | addnodes: 235 | vector nodes; 236 | raft_->nodes(&nodes); 237 | size_t j; 238 | for (j = 0; j < nodes.size(); ++j) { 239 | confState_->add_nodes(nodes[j]); 240 | } 241 | } 242 | 243 | void 244 | NodeImpl::reset() { 245 | msgType_ = NoneMessage; 246 | confState_ = NULL; 247 | } 248 | 249 | bool 250 | NodeImpl::isMessageFromClusterNode(const Message& msg) { 251 | return (raft_->progressMap_.find(msg.from()) != raft_->progressMap_.end()); 252 | } 253 | 254 | void 255 | NodeImpl::newReady() { 256 | // 1) reset ready data 257 | ready_.softState = kEmptySoftState; 258 | ready_.hardState = kEmptyHardState; 259 | ready_.snapshot = NULL; 260 | ready_.readStates.clear(); 261 | ready_.entries.clear(); 262 | ready_.committedEntries.clear(); 263 | ready_.messages.clear(); 264 | 265 | // 2) return the new 
ready state data in ready 266 | raft_->raftLog_->unstableEntries(&ready_.entries); 267 | raft_->raftLog_->nextEntries(&ready_.committedEntries); 268 | ready_.messages = raft_->outMsgs_; 269 | 270 | SoftState ss; 271 | raft_->softState(&ss); 272 | if (!isSoftStateEqual(ss, prevSoftState_)) { 273 | ready_.softState = ss; 274 | } 275 | 276 | HardState hs; 277 | raft_->hardState(&hs); 278 | if (!isHardStateEqual(hs, prevHardState_)) { 279 | ready_.hardState = hs; 280 | } 281 | 282 | if (raft_->raftLog_->unstable_.snapshot_ != NULL) { 283 | ready_.snapshot = raft_->raftLog_->unstable_.snapshot_; 284 | } 285 | 286 | if (!raft_->readStates_.empty()) { 287 | ready_.readStates = raft_->readStates_; 288 | } 289 | 290 | // 3) save the state data 291 | prevSoftState_ = ready_.softState; 292 | size_t entSize = ready_.entries.size(); 293 | if (entSize > 0) { 294 | prevLastUnstableIndex_ = ready_.entries[entSize - 1].index(); 295 | prevLastUnstableTerm_ = ready_.entries[entSize - 1].term(); 296 | havePrevLastUnstableIndex_ = true; 297 | } 298 | if (!isEmptyHardState(ready_.hardState)) { 299 | prevHardState_ = ready_.hardState; 300 | } 301 | if (!isEmptySnapshot(ready_.snapshot)) { 302 | prevSnapshotIndex_ = ready_.snapshot->metadata().index(); 303 | } 304 | 305 | raft_->outMsgs_.clear(); 306 | raft_->readStates_.clear(); 307 | } 308 | 309 | void 310 | NodeImpl::Stop() { 311 | stopped_ = true; 312 | } 313 | 314 | bool 315 | NodeImpl::readyContainUpdate() { 316 | return (!isEmptySoftState(ready_.softState) || 317 | !isEmptyHardState(ready_.hardState) || 318 | !isEmptySnapshot(ready_.snapshot) || 319 | !ready_.entries.empty() || 320 | !ready_.committedEntries.empty() || 321 | !ready_.messages.empty() || 322 | !ready_.readStates.empty()); 323 | } 324 | 325 | // StartNode returns a new Node given configuration and a list of raft peers. 326 | // It appends a ConfChangeAddNode entry for each given peer to the initial log. 
327 | Node* 328 | StartNode(Config* config, const vector& peers) { 329 | raft *r = newRaft(config); 330 | 331 | if (r == NULL) { 332 | return NULL; 333 | } 334 | // become the follower at term 1 and apply initial configuration 335 | // entries of term 1 336 | r->becomeFollower(1, kEmptyPeerId); 337 | 338 | size_t i; 339 | for (i = 0; i < peers.size(); ++i) { 340 | const Peer& peer = peers[i]; 341 | 342 | ConfChange cc; 343 | cc.set_type(ConfChangeAddNode); 344 | cc.set_nodeid(peer.Id); 345 | cc.set_context(peer.Context); 346 | string str; 347 | cc.SerializeToString(&str); 348 | 349 | Entry entry; 350 | EntryVec entries; 351 | entry.set_type(EntryConfChange); 352 | entry.set_term(1); 353 | entry.set_index(r->raftLog_->lastIndex() + 1); 354 | entry.set_data(str); 355 | entries.push_back(entry); 356 | 357 | r->raftLog_->append(entries); 358 | } 359 | 360 | // Mark these initial entries as committed. 361 | // TODO(bdarnell): These entries are still unstable; do we need to preserve 362 | // the invariant that committed < unstable? 363 | r->raftLog_->committed_ = r->raftLog_->lastIndex(); 364 | 365 | // Now apply them, mainly so that the application can call Campaign 366 | // immediately after StartNode in tests. Note that these nodes will 367 | // be added to raft twice: here and when the application's Ready 368 | // loop calls ApplyConfChange. The calls to addNode must come after 369 | // all calls to raftLog.append so progress.next is set after these 370 | // bootstrapping entries (it is an error if we try to append these 371 | // entries since they have already been committed). 372 | // We do not set raftLog.applied so the application will be able 373 | // to observe all conf changes via Ready.CommittedEntries. 374 | for (i = 0; i < peers.size(); ++i) { 375 | const Peer& peer = peers[i]; 376 | r->addNode(peer.Id); 377 | } 378 | 379 | return new NodeImpl(r,config); 380 | } 381 | 382 | // RestartNode is similar to StartNode but does not take a list of peers. 
383 | // The current membership of the cluster will be restored from the Storage. 384 | // If the caller has an existing state machine, pass in the last log index that 385 | // has been applied to it; otherwise use zero. 386 | Node* 387 | RestartNode(Config *config) { 388 | raft *r = newRaft(config); 389 | if (r == NULL) { 390 | return NULL; 391 | } 392 | 393 | return new NodeImpl(r,config); 394 | } 395 | 396 | }; // namespace libraft -------------------------------------------------------------------------------- /src/core/node.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_NODE_H__ 6 | #define __LIBRAFT_NODE_H__ 7 | 8 | #include "libraft.h" 9 | #include "core/fsm_caller.h" 10 | 11 | namespace libraft { 12 | 13 | enum NodeMessageType { 14 | ProposeMessage = 0, 15 | RecvMessage = 1, 16 | ConfChangeMessage = 2, 17 | TickMessage = 3, 18 | ReadyMessage = 4, 19 | NoneMessage = 5 20 | }; 21 | 22 | struct raft; 23 | 24 | struct Ready { 25 | // The current volatile state of a Node. 26 | // SoftState will be nil if there is no update. 27 | // It is not required to consume or store SoftState. 28 | SoftState softState; 29 | 30 | // The current state of a Node to be saved to stable storage BEFORE 31 | // Messages are sent. 32 | // HardState will be equal to empty state if there is no update. 33 | HardState hardState; 34 | 35 | // ReadStates can be used for node to serve linearizable read requests locally 36 | // when its applied index is greater than the index in ReadState. 37 | // Note that the readState will be returned when raft receives msgReadIndex. 38 | // The returned is only valid for the request that requested to read. 39 | vector readStates; 40 | 41 | // Entries specifies entries to be saved to stable storage BEFORE 42 | // Messages are sent. 43 | EntryVec entries; 44 | 45 | // Snapshot specifies the snapshot to be saved to stable storage. 
46 | Snapshot *snapshot; 47 | 48 | // CommittedEntries specifies entries to be committed to a 49 | // store/state-machine. These have previously been committed to stable 50 | // store. 51 | EntryVec committedEntries; 52 | 53 | // Messages specifies outbound messages to be sent AFTER Entries are 54 | // committed to stable storage. 55 | // If it contains a MsgSnap message, the application MUST report back to raft 56 | // when the snapshot has been received or has failed by calling ReportSnapshot. 57 | MessageVec messages; 58 | }; 59 | 60 | class NodeImpl : public Node { 61 | public: 62 | NodeImpl(raft*, Config*); 63 | virtual ~NodeImpl(); 64 | 65 | virtual void Tick(); 66 | virtual int Campaign(); 67 | virtual int Propose(const string& data); 68 | virtual int ProposeConfChange(const ConfChange& cc); 69 | virtual int Step(const Message& msg); 70 | virtual void Advance(); 71 | virtual void ApplyConfChange(const ConfChange& cc, ConfState *cs); 72 | virtual void TransferLeadership(uint64_t leader, uint64_t transferee); 73 | virtual int ReadIndex(const string &rctx); 74 | virtual void Stop(); 75 | 76 | Ready* get_ready() { 77 | return &ready_; 78 | } 79 | 80 | private: 81 | int stateMachine(const Message& msg); 82 | void newReady(); 83 | int doStep(const Message& msg); 84 | bool isMessageFromClusterNode(const Message& msg); 85 | void handleConfChange(); 86 | void handleAdvance(); 87 | void reset(); 88 | bool readyContainUpdate(); 89 | 90 | public: 91 | bool stopped_; 92 | 93 | // the Raft state machine 94 | raft *raft_; 95 | 96 | // save previous the state 97 | uint64_t leader_; 98 | SoftState prevSoftState_; 99 | HardState prevHardState_; 100 | bool waitAdvanced_; 101 | 102 | // save Ready data in each step 103 | Ready ready_; 104 | 105 | FsmCaller fsm_caller_; 106 | 107 | // if there is no leader, then cannot propose any msg 108 | bool canPropose_; 109 | 110 | // save state machine msg type 111 | NodeMessageType msgType_; 112 | 113 | // save previous storage data, in 
`Advance' func, use these datas to update storage 114 | uint64_t prevLastUnstableIndex_; 115 | uint64_t prevLastUnstableTerm_; 116 | bool havePrevLastUnstableIndex_; 117 | uint64_t prevSnapshotIndex_; 118 | 119 | // for ApplyConfChange 120 | ConfChange confChange_; 121 | ConfState* confState_; 122 | }; 123 | 124 | }; // namespace libraft 125 | 126 | #endif // __LIBRAFT_NODE_H__ 127 | -------------------------------------------------------------------------------- /src/core/progress.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include "core/progress.h" 7 | 8 | namespace libraft { 9 | 10 | Progress::Progress(uint64_t next, int maxInfilght) 11 | : match_(0), 12 | next_(next), 13 | state_(ProgressStateProbe), 14 | paused_(false), 15 | pendingSnapshot_(0), 16 | recentActive_(false), 17 | inflights_(inflights(maxInfilght)) { 18 | } 19 | 20 | Progress::~Progress() { 21 | } 22 | 23 | void 24 | Progress::resetState(ProgressState state) { 25 | paused_ = false; 26 | pendingSnapshot_ = 0; 27 | state_ = state; 28 | inflights_.reset(); 29 | } 30 | 31 | void 32 | Progress::becomeProbe() { 33 | // If the original state is ProgressStateSnapshot, progress knows that 34 | // the pending snapshot has been sent to this peer successfully, then 35 | // probes from pendingSnapshot + 1. 
36 | if (state_ == ProgressStateSnapshot) { 37 | uint64_t pendingSnapshot = pendingSnapshot_; 38 | resetState(ProgressStateProbe); 39 | next_ = max(match_ + 1, pendingSnapshot + 1); 40 | } else { 41 | resetState(ProgressStateProbe); 42 | next_ = match_ + 1; 43 | } 44 | } 45 | 46 | void 47 | Progress::becomeReplicate() { 48 | resetState(ProgressStateReplicate); 49 | next_ = match_ + 1; 50 | } 51 | 52 | void 53 | Progress::becomeSnapshot(uint64_t snapshoti) { 54 | resetState(ProgressStateSnapshot); 55 | pendingSnapshot_ = snapshoti; 56 | } 57 | 58 | // maybeUpdate returns false if the given n index comes from an outdated message. 59 | // Otherwise it updates the progress and returns true. 60 | bool 61 | Progress::maybeUpdate(uint64_t n) { 62 | bool updated = false; 63 | if (match_ < n) { 64 | match_ = n; 65 | updated = true; 66 | resume(); 67 | } 68 | if (next_ < n + 1) { 69 | next_ = n + 1; 70 | } 71 | return updated; 72 | } 73 | 74 | void 75 | Progress::optimisticUpdate(uint64_t n) { 76 | next_ = n + 1; 77 | } 78 | 79 | void 80 | Progress::snapshotFailure() { 81 | pendingSnapshot_ = 0; 82 | } 83 | 84 | // maybeDecrTo returns false if the given to index comes from an out of order message. 85 | // Otherwise it decreases the progress next index to min(rejected, last) and returns true. 86 | bool 87 | Progress::maybeDecrTo(uint64_t rejected, uint64_t last) { 88 | if (state_ == ProgressStateReplicate) { 89 | // the rejection must be stale if the progress has matched and "rejected" 90 | // is smaller than "match". 
91 | if (rejected <= match_) { 92 | return false; 93 | } 94 | // directly decrease next to match + 1 95 | next_ = match_ + 1; 96 | return true; 97 | } 98 | 99 | // the rejection must be stale if "rejected" does not match next - 1 100 | if (next_ - 1 != rejected) { 101 | return false; 102 | } 103 | 104 | next_ = min(rejected, last + 1); 105 | if (next_ < 1) { 106 | next_ = 1; 107 | } 108 | resume(); 109 | return true; 110 | } 111 | 112 | void 113 | Progress::pause() { 114 | paused_ = true; 115 | } 116 | 117 | void 118 | Progress::resume() { 119 | paused_ = false; 120 | } 121 | 122 | const char* Progress::stateString() { 123 | if (state_ == ProgressStateProbe) { 124 | return "ProgressStateProbe"; 125 | } 126 | if (state_ == ProgressStateSnapshot) { 127 | return "ProgressStateSnapshot"; 128 | } 129 | if (state_ == ProgressStateReplicate) { 130 | return "ProgressStateReplicate"; 131 | } 132 | 133 | return "unknown state"; 134 | } 135 | 136 | // IsPaused returns whether sending log entries to this node has been 137 | // paused. A node may be paused because it has rejected recent 138 | // MsgApps, is currently waiting for a snapshot, or has reached the 139 | // MaxInflightMsgs limit. 140 | bool 141 | Progress::isPaused() { 142 | switch (state_) { 143 | case ProgressStateProbe: 144 | return paused_; 145 | case ProgressStateReplicate: 146 | return inflights_.full(); 147 | case ProgressStateSnapshot: 148 | return true; 149 | } 150 | } 151 | 152 | // needSnapshotAbort returns true if snapshot progress's Match 153 | // is equal or higher than the pendingSnapshot. 
154 | bool 155 | Progress::needSnapshotAbort() { 156 | return state_ == ProgressStateSnapshot && match_ >= pendingSnapshot_; 157 | } 158 | 159 | string 160 | Progress::String() { 161 | char tmp[500]; 162 | snprintf(tmp, sizeof(tmp), "next = %llu, match = %llu, state = %s, waiting = %d, pendingSnapshot = %llu", 163 | next_, match_, stateString(), isPaused(), pendingSnapshot_); 164 | return std::string(tmp); 165 | } 166 | 167 | void 168 | inflights::add(uint64_t infight) { 169 | if (full()) { 170 | Fatalf("cannot add into a full inflights"); 171 | } 172 | 173 | uint64_t next = start_ + count_; 174 | uint64_t size = size_; 175 | 176 | if (next >= size) { 177 | next -= size; 178 | } 179 | 180 | if (next >= buffer_.size()) { 181 | growBuf(); 182 | } 183 | buffer_[next] = infight; 184 | count_++; 185 | } 186 | 187 | // grow the inflight buffer by doubling up to inflights.size. We grow on demand 188 | // instead of preallocating to inflights.size to handle systems which have 189 | // thousands of Raft groups per process. 190 | void 191 | inflights::growBuf() { 192 | uint32_t newSize = buffer_.size() * 2; 193 | if (newSize == 0) { 194 | newSize = 1; 195 | } else if (newSize > size_) { 196 | newSize = size_; 197 | } 198 | 199 | buffer_.resize(newSize); 200 | } 201 | 202 | // freeTo frees the inflights smaller or equal to the given `to` flight. 
203 | void 204 | inflights::freeTo(uint64_t to) { 205 | if (count_ == 0 || to < buffer_[start_]) { 206 | return; 207 | } 208 | 209 | uint64_t i = 0, idx = start_; 210 | for (i = 0; i < count_; ++i) { 211 | if (to < buffer_[idx]) { // found the first large inflight 212 | break; 213 | } 214 | 215 | // increase index and maybe rotate 216 | uint64_t size = size_; 217 | ++idx; 218 | if (idx >= size) { 219 | idx -= size; 220 | } 221 | } 222 | 223 | // free i inflights and set new start index 224 | count_ -= i; 225 | start_ = idx; 226 | if (count_ == 0) { 227 | // inflights is empty, reset the start index so that we don't grow the 228 | // buffer unnecessarily. 229 | start_ = 0; 230 | } 231 | } 232 | 233 | void 234 | inflights::freeFirstOne() { 235 | freeTo(buffer_[start_]); 236 | } 237 | 238 | bool 239 | inflights::full() { 240 | return count_ == size_; 241 | } 242 | 243 | void 244 | inflights::reset() { 245 | count_ = 0; 246 | start_ = 0; 247 | } 248 | 249 | }; // namespace libraft -------------------------------------------------------------------------------- /src/core/progress.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_PROGRESS_H__ 6 | #define __LIBRAFT_PROGRESS_H__ 7 | 8 | #include "libraft.h" 9 | 10 | namespace libraft { 11 | 12 | // inflights is a sliding window for the inflight messages. 13 | struct inflights { 14 | // the starting index in the buffer 15 | int start_; 16 | 17 | // number of inflights in the buffer 18 | uint32_t count_; 19 | 20 | // the size of the buffer 21 | uint32_t size_; 22 | 23 | // buffer contains the index of the last entry 24 | // inside one message. 
// Progress represents a follower's progress in the view of the leader. Leader maintains
// progresses of all followers, and sends entries to the follower based on its progress.
struct Progress {
  // For each follower, match_ is the index of the highest log entry known to
  // be replicated on it (initialized to 0; maybeUpdate only ever raises it).
  uint64_t match_;

  // For each follower, next_ is the index of the next log entry to send to it
  // (initialized from the `next` constructor argument).
  uint64_t next_;

  // State defines how the leader should interact with the follower.
  ProgressState state_;

  // paused_ is used in ProgressStateProbe.
  // When paused_ is true, raft should pause sending replication messages to this peer.
  bool paused_;

  // pendingSnapshot_ is used in ProgressStateSnapshot.
  // If there is a pending snapshot, pendingSnapshot_ is set to the index of
  // that snapshot. While it is set, the replication process of this Progress
  // is paused; raft will not resend a snapshot until the pending one is
  // reported to have failed.
  uint64_t pendingSnapshot_;

  // recentActive_ is true if the progress is recently active. Receiving any messages
  // from the corresponding follower indicates the progress is active.
  // recentActive_ can be reset to false after an election timeout.
  bool recentActive_;

  // inflights_ is a sliding window for the inflight messages.
  // Each inflight message contains one or more log entries.
  // The max number of entries per message is defined in raft config as MaxSizePerMsg.
  // Thus inflight effectively limits both the number of inflight messages
  // and the bandwidth each Progress can use.
  // When inflights is full, no more message should be sent.
  // When a leader sends out a message, the index of the last
  // entry should be added to inflights. The index MUST be added
  // into inflights in order.
  // When a leader receives a reply, the previous inflights should
  // be freed by calling inflights.freeTo with the index of the last
  // received entry.
  inflights inflights_;

  // Returns the name of state_ as a C string ("unknown state" for any value
  // outside the enum).
  const char* stateString();

  // reset progress state and sliding window
  void resetState(ProgressState state);

  // State transitions. Each one resets the state through resetState() first.
  void becomeProbe();
  void becomeReplicate();
  void becomeSnapshot(uint64_t snapshoti);

  // Returns false if the given index n comes from an outdated message;
  // otherwise updates the progress and returns true.
  bool maybeUpdate(uint64_t n);

  // Sets next_ to n + 1 without waiting for an acknowledgement.
  void optimisticUpdate(uint64_t n);

  // Returns false if the rejection comes from an out of order message;
  // otherwise decreases next_ and returns true.
  bool maybeDecrTo(uint64_t rejected, uint64_t last);

  // Clears pendingSnapshot_.
  void snapshotFailure();

  // Set / clear paused_.
  void pause();
  void resume();

  // Returns whether sending log entries to this node is currently paused.
  bool isPaused();

  // Returns true in snapshot state once match_ has reached pendingSnapshot_.
  bool needSnapshotAbort();

  // Human-readable dump of the progress fields.
  string String();

  // next: initial value of next_; maxInfilght: capacity of the inflight window.
  Progress(uint64_t next, int maxInfilght);
  ~Progress();
};
26 | CampaignElection = 1, 27 | // CampaignTransfer represents the type of leader transfer 28 | CampaignTransfer = 2 29 | }; 30 | 31 | struct raft; 32 | 33 | typedef void (*stepFun)(raft *, const Message&); 34 | 35 | // the Raft State Machine 36 | struct raft { 37 | uint64_t id_; 38 | uint64_t term_; 39 | uint64_t vote_; 40 | 41 | vector readStates_; 42 | raftLog *raftLog_; 43 | int maxInfilght_; 44 | uint64_t maxMsgSize_; 45 | 46 | // cluster node Progress Map 47 | map progressMap_; 48 | 49 | StateType state_; 50 | map votes_; 51 | 52 | // save every outbound msg in outMsgs_,then msgs will be moved to `Ready' struct 53 | MessageVec outMsgs_; 54 | 55 | // current leader id, default is kEmptyPeerId. 56 | uint64_t leader_; 57 | 58 | // leadTransferee is id of the leader transfer target when its value is not zero. 59 | // Follow the procedure defined in raft thesis 3.10. 60 | // default is kEmptyPeerId. 61 | uint64_t leadTransferee_; 62 | 63 | // New configuration is ignored if there exists unapplied configuration. 64 | bool pendingConf_; 65 | readOnly* readOnly_; 66 | 67 | // number of ticks since it reached last electionTimeout when it is leader 68 | // or candidate. 69 | // number of ticks since it reached last electionTimeout or received a 70 | // valid message from current leader when it is a follower. 71 | int electionElapsed_; 72 | 73 | // number of ticks since it reached last heartbeatTimeout. 74 | // only leader keeps heartbeatElapsed. 75 | int heartbeatElapsed_; 76 | 77 | // number of ticks timeout to send heartbeat 78 | int heartbeatTimeout_; 79 | 80 | // number of ticks timeout to election 81 | int electionTimeout_; 82 | 83 | bool checkQuorum_; 84 | bool preVote_; 85 | 86 | // randomizedElectionTimeout is a random number between 87 | // [electiontimeout, 2 * electiontimeout - 1]. It gets reset 88 | // when raft changes its state to follower or candidate. 
89 | int randomizedElectionTimeout_; 90 | 91 | // current role state machine function 92 | stepFun stateStepFunc_; 93 | 94 | raft(const Config *, raftLog *); 95 | 96 | ~raft(); 97 | 98 | // called by Node in each `Tick' 99 | void tick(); 100 | 101 | // load HardState in hs 102 | void loadState(const HardState &hs); 103 | 104 | // return current cluster node id in nodes 105 | void nodes(vector *nodes); 106 | 107 | // return true if leader_ is not none 108 | bool hasLeader(); 109 | 110 | // return SoftState in ss 111 | void softState(SoftState *ss); 112 | 113 | // return HardState in hs 114 | void hardState(HardState *hs); 115 | 116 | // return current cluster quorum 117 | int quorum(); 118 | 119 | // send out messages 120 | void send(Message *msg); 121 | 122 | // send append message 123 | void sendAppend(uint64_t to); 124 | 125 | // send heartbeat message 126 | void sendHeartbeat(uint64_t to, const string &ctx); 127 | 128 | // broadcast append message to cluster 129 | void bcastAppend(); 130 | 131 | // broadcast heartbeat message to cluster 132 | void bcastHeartbeat(); 133 | void bcastHeartbeatWithCtx(const string &ctx); 134 | 135 | // change to follower state 136 | void becomeFollower(uint64_t term, uint64_t leader); 137 | 138 | // change to candidate state 139 | void becomeCandidate(); 140 | 141 | // change to pre-candidate state 142 | void becomePreCandidate(); 143 | 144 | // change to leader state 145 | void becomeLeader(); 146 | 147 | // campaign for the new leader 148 | void campaign(CampaignType t); 149 | 150 | // maybeCommit attempts to advance the commit index. Returns true if 151 | // the commit index changed (in which case the caller should 152 | // call r.bcastAppend). 
153 | bool maybeCommit(); 154 | 155 | // reset to term 156 | void reset(uint64_t term); 157 | 158 | // append entries to storage 159 | void appendEntry(EntryVec* entries); 160 | 161 | // handle append entries message 162 | void handleAppendEntries(const Message& msg); 163 | 164 | // handle heartbeat message 165 | void handleHeartbeat(const Message& msg); 166 | 167 | // handle snapshot message 168 | void handleSnapshot(const Message& msg); 169 | 170 | // tickElection is run by followers and candidates after r.electionTimeout. 171 | void tickElection(); 172 | 173 | // tickHeartbeat is run by leaders to send a MsgBeat after r.heartbeatTimeout. 174 | void tickHeartbeat(); 175 | 176 | // v means peer `id' accepted or not,after it return num of 177 | // granted peers in cluster 178 | int poll(uint64_t id, MessageType t, bool v); 179 | 180 | // raft state machine main routine,handle a message from node 181 | int step(const Message& msg); 182 | 183 | // promotable indicates whether state machine can be promoted to leader, 184 | // which is true when its own id is in progress list. 185 | bool promotable(); 186 | 187 | // restore recovers the state machine from a snapshot. It restores the log and the 188 | // configuration of state machine. 
189 | // return false if snapshot's [index,term] match log's [index,term] 190 | bool restore(const Snapshot& snapshot); 191 | 192 | void delProgress(uint64_t id); 193 | void addNode(uint64_t id); 194 | void removeNode(uint64_t id); 195 | 196 | // pastElectionTimeout returns true if electionElapsed_ >= randomizedElectionTimeout_ 197 | bool pastElectionTimeout(); 198 | 199 | // reset randomizedElectionTimeout_ to [electiontimeout, 2 * electiontimeout - 1] 200 | void resetRandomizedElectionTimeout(); 201 | 202 | void setProgress(uint64_t id, uint64_t match, uint64_t next); 203 | void abortLeaderTransfer(); 204 | void proxyMessage(const Message& msg); 205 | 206 | // read out messages,after call will clean the outMsgs_(only used in test) 207 | void readMessages(MessageVec *); 208 | 209 | // checkQuorumActive returns true if the quorum is active from 210 | // the view of the local raft state machine. 211 | // checkQuorumActive also resets all RecentActive to false. 212 | bool checkQuorumActive(); 213 | 214 | // send timeout msg 215 | void sendTimeoutNow(uint64_t to); 216 | 217 | void resetPendingConf(); 218 | }; 219 | 220 | extern raft* newRaft(Config *); 221 | string entryString(const Entry& entry); 222 | 223 | // different role's state machine functions,after `Raft' change state, 224 | // `stateStepFunc_' will be set to the proper function 225 | void stepLeader(raft *r, const Message& msg); 226 | void stepCandidate(raft* r, const Message& msg); 227 | void stepFollower(raft* r, const Message& msg); 228 | 229 | }; // namespace libraft 230 | 231 | #endif // __LIBRAFT_RAFT_H__ 232 | -------------------------------------------------------------------------------- /src/core/read_only.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include "core/read_only.h" 7 | 8 | namespace libraft { 9 | 10 | readOnly::readOnly(ReadOnlyOption option) 11 | : option_(option) { 
12 | } 13 | 14 | // addRequest adds a read only request into readonly struct. 15 | // `index` is the commit index of the raft state machine when it received 16 | // the read only request. 17 | // `msg` is the original read only request message from the local or remote node. 18 | void readOnly::addRequest(uint64_t index, Message *msg) { 19 | string ctx = msg->entries(0).data(); // the data of the first entry is used as the request context 20 | if (pendingReadIndex_.find(ctx) != pendingReadIndex_.end()) { 21 | return; // a request with this context is already pending: keep the first one 22 | } 23 | pendingReadIndex_[ctx] = new readIndexStatus(index, msg); 24 | readIndexQueue_.push_back(ctx); 25 | } 26 | 27 | // recvAck notifies the readonly struct that the raft state machine received 28 | // an acknowledgment of the heartbeat that attached with the read only request 29 | // context. Returns 0 if no request with that context is pending, otherwise the number of recorded acks plus one (presumably counting the local node). 30 | int readOnly::recvAck(const Message& msg) { 31 | map::iterator iter = pendingReadIndex_.find(msg.context()); 32 | if (iter == pendingReadIndex_.end()) { 33 | return 0; 34 | } 35 | 36 | readIndexStatus* rs = iter->second; 37 | rs->acks_[msg.from()] = true; 38 | return rs->acks_.size() + 1; 39 | } 40 | 41 | // advance advances the read only request queue kept by the readonly struct. 42 | // It dequeues the requests until it finds the read only request that has 43 | // the same context as the given `msg`.
44 | void readOnly::advance(const Message& msg, vector *rss) { 45 | size_t i; 46 | bool found = false; 47 | string ctx = msg.context(); 48 | 49 | for (i = 0; i < readIndexQueue_.size(); ++i) { 50 | map::iterator iter = pendingReadIndex_.find(ctx); 51 | if (iter == pendingReadIndex_.end()) { 52 | Fatalf("cannot find corresponding read state from pending map"); 53 | } 54 | 55 | readIndexStatus* rs = iter->second; 56 | rss->push_back(rs); 57 | if (ctx == readIndexQueue_[i]) { 58 | found = true; 59 | break; 60 | } 61 | } 62 | 63 | if (found) { 64 | ++i; 65 | readIndexQueue_.erase(readIndexQueue_.begin(), readIndexQueue_.begin() + i); 66 | for (i = 0; i < rss->size(); ++i) { 67 | pendingReadIndex_.erase((*rss)[i]->req_->entries(0).data()); 68 | } 69 | } 70 | } 71 | 72 | // lastPendingRequestCtx returns the context of the last pending read only 73 | // request in readonly struct. 74 | string readOnly::lastPendingRequestCtx() { 75 | if (readIndexQueue_.size() == 0) { 76 | return ""; 77 | } 78 | 79 | return readIndexQueue_[readIndexQueue_.size() - 1]; 80 | } 81 | 82 | }; // namespace libraft -------------------------------------------------------------------------------- /src/core/read_only.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_READ_ONLY_H__ 6 | #define __LIBRAFT_READ_ONLY_H__ 7 | 8 | #include 9 | #include 10 | #include "core/raft.h" 11 | 12 | using namespace std; 13 | namespace libraft { 14 | 15 | struct readIndexStatus { 16 | uint64_t index; 17 | Message *req_; 18 | map acks_; 19 | 20 | readIndexStatus(uint64_t index, Message *msg) 21 | : index(index), 22 | req_(msg) { 23 | } 24 | }; 25 | 26 | struct readOnly { 27 | ReadOnlyOption option_; 28 | map pendingReadIndex_; 29 | vector readIndexQueue_; 30 | 31 | readOnly(ReadOnlyOption option); 32 | void addRequest(uint64_t index, Message *msg); 33 | int recvAck(const Message& msg); 34 | void advance(const Message& 
msg, vector* rss); 35 | string lastPendingRequestCtx(); 36 | }; 37 | 38 | }; // namespace libraft 39 | 40 | #endif // __LIBRAFT_READ_ONLY_H__ 41 | -------------------------------------------------------------------------------- /src/io/buffer_io_reader.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "io/buffer_io_reader.h" 6 | #include "io/io.h" 7 | #include "base/io_error.h" 8 | 9 | namespace libraft { 10 | static const uint32_t kBlockSize = 1024; 11 | 12 | struct Block { 13 | char buffer[kBlockSize]; 14 | uint32_t read_pos; 15 | uint32_t write_pos; 16 | Block* next; 17 | 18 | uint32_t WriteSize() { return kBlockSize - write_pos; } 19 | 20 | uint32_t ReadSize() { 21 | if (write_pos <= read_pos) { 22 | return 0; 23 | } 24 | return write_pos - read_pos; 25 | } 26 | 27 | size_t Read(char* buf, size_t size) { // no bounds check here: the caller must ensure size <= ReadSize() 28 | memcpy(buf, ReadPos(), size); 29 | advanceRead(size); 30 | return size; 31 | } 32 | 33 | size_t ReadAll(char* buf, size_t size) { // copies min(size, ReadSize()) bytes into buf and returns that count 34 | size_t readSize = ReadSize(); 35 | if (size > readSize) { 36 | size = readSize; 37 | } 38 | memcpy(buf, ReadPos(), size); // never copy more than the caller asked for 39 | advanceRead(size); 40 | return size; 41 | } 42 | 43 | void AdvanceWrite(uint32_t wpos) { write_pos += wpos;} 44 | 45 | char* WritePos() { return buffer + write_pos; } 46 | 47 | void advanceRead(uint32_t rpos) { 48 | read_pos += rpos; 49 | } 50 | 51 | char* ReadPos() { return buffer + read_pos; } 52 | 53 | Block() 54 | : read_pos(0), 55 | write_pos(), 56 | next(NULL) { 57 | } 58 | 59 | ~Block() { 60 | if (next) { 61 | delete next; 62 | } 63 | } 64 | }; 65 | 66 | BufferIOReader::BufferIOReader(IOReader* reader) 67 | : head_(new Block()), 68 | read_block_(head_), 69 | write_block_(head_), 70 | reader_(reader) { 71 | } 72 | 73 | BufferIOReader::~BufferIOReader() { 74 | delete head_; 75 | delete reader_; 76 | } 77 | 78 | void 79 | BufferIOReader::readAtLeast(size_t size) { 80 | int err = kOK; 81 | 82 |
while (size > 0) { 83 | if (write_block_->WriteSize() == 0) { 84 | write_block_->next = new Block(); 85 | write_block_ = write_block_->next; 86 | } 87 | uint32_t wsize = write_block_->WriteSize() >= kBlockSize ? kBlockSize : write_block_->WriteSize(); 88 | size_t rsize = reader_->Read(write_block_->WritePos(), wsize, &err); 89 | write_block_->AdvanceWrite(rsize); 90 | // size is unsigned: stop before the subtraction could wrap around, and also 91 | // when the reader failed, hit EOF, or made no progress 92 | if (err != kOK || rsize == 0 || rsize >= size) { 93 | break; 94 | } 95 | size -= rsize; 96 | } 97 | } 98 | 99 | size_t 100 | BufferIOReader::Read(char* buf, size_t size, int* err) { 101 | // does the current block already hold enough data? 102 | if (read_block_->ReadSize() >= size) { 103 | read_block_->Read(buf, size); 104 | *err = kOK; 105 | return size; 106 | } 107 | 108 | readAtLeast(size); 109 | size_t readSize = read_block_->ReadAll(buf, size); 110 | *err = (readSize == size) ? kOK : kEOF; 111 | return readSize; 112 | } 113 | 114 | // ReadInt64 copies the next sizeof(int64_t) buffered bytes into *ret. 115 | // Returns 0 on success, or kEOF (leaving *ret untouched) when fewer bytes are available. 116 | int 117 | BufferIOReader::ReadInt64(int64_t* ret) { 118 | if (read_block_->ReadSize() >= sizeof(int64_t)) { 119 | read_block_->Read((char*)ret, sizeof(int64_t)); 120 | return 0; 121 | } 122 | 123 | readAtLeast(sizeof(int64_t)); 124 | if (read_block_->ReadSize() >= sizeof(int64_t)) { 125 | read_block_->Read((char*)ret, sizeof(int64_t)); 126 | return 0; 127 | } 128 | 129 | // not enough data: report it instead of copying bytes past the written 130 | // region and claiming success 131 | return kEOF; 132 | } 133 | 134 | }; -------------------------------------------------------------------------------- /src/io/buffer_io_reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_BUFFER_IO_READER_H__ 6 | #define __LIBRAFT_BUFFER_IO_READER_H__ 7 | 8 | #include 9 | 10 | using namespace std; 11 | 12 | namespace libraft { 13 | 14 | class IOReader; 15 | struct Block; 16 | 17 | class BufferIOReader { 18 | public: 19 | BufferIOReader(IOReader* reader); 20 | 21 | virtual ~BufferIOReader(); 22 | 23 | virtual size_t Read(char* buf, size_t size, int* err); 24 | 25 |
int ReadInt64(int64_t* ret); 26 | 27 | private: 28 | void readAtLeast(size_t size); 29 | 30 | private: 31 | Block *head_; 32 | Block* read_block_; 33 | Block* write_block_; 34 | IOReader* reader_; 35 | }; 36 | 37 | }; 38 | 39 | #endif // __LIBRAFT_BUFFER_IO_READER_H__ -------------------------------------------------------------------------------- /src/io/file_io.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_FILE_IO_H__ 6 | #define __LIBRAFT_FILE_IO_H__ 7 | 8 | #include "io/io.h" 9 | 10 | namespace libraft { 11 | 12 | class File; 13 | 14 | class FileIO : public IOReader, 15 | public IOWriter { 16 | public: 17 | virtual ~FileIO() {} 18 | 19 | virtual size_t Read(char* buf, size_t size, int* err); 20 | virtual size_t Write(const char* buf, size_t size, int* err); 21 | 22 | private: 23 | File* file_; 24 | }; 25 | 26 | }; 27 | 28 | #endif // __LIBRAFT_FILE_IO_H__ 29 | -------------------------------------------------------------------------------- /src/io/io.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_IO_H__ 6 | #define __LIBRAFT_IO_H__ 7 | 8 | namespace libraft { 9 | class IOReader { 10 | public: 11 | virtual ~IOReader() {} 12 | 13 | // read at least size bytes data, return actual read size and error code 14 | virtual size_t Read(char* buf, size_t size, int* err) = 0; 15 | }; 16 | 17 | class IOWriter { 18 | public: 19 | virtual ~IOWriter() {} 20 | 21 | virtual size_t Write(const char* buf, size_t size, int* err) = 0; 22 | }; 23 | 24 | }; 25 | 26 | #endif // __LIBRAFT_IO_H__ 27 | -------------------------------------------------------------------------------- /src/io/memory_io.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_MEMORY_IO_H__ 6 | #define 
__LIBRAFT_MEMORY_IO_H__ 7 | 8 | #include "io/io.h" 9 | 10 | namespace libraft { 11 | 12 | class File; 13 | 14 | class MemoryIO : public IOReader, 15 | public IOWriter { 16 | public: 17 | virtual ~MemoryIO() {} 18 | 19 | virtual size_t Read(char* buf, size_t size, int* err); 20 | virtual size_t Write(const char* buf, size_t size, int* err); 21 | 22 | private: 23 | File* file_; 24 | }; 25 | 26 | }; 27 | 28 | #endif // __LIBRAFT_MEMORY_IO_H__ 29 | -------------------------------------------------------------------------------- /src/proto/raft.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | package raftpb; 3 | 4 | enum EntryType { 5 | EntryNormal = 0; 6 | EntryConfChange = 1; 7 | } 8 | 9 | message Entry { 10 | required EntryType Type = 1; 11 | required uint64 Term = 2; 12 | required uint64 Index = 3; 13 | optional bytes Data = 4; 14 | } 15 | 16 | message ConfState { 17 | repeated uint64 nodes = 1; 18 | } 19 | 20 | message SnapshotMetadata { 21 | optional ConfState conf_state = 1; 22 | optional uint64 index = 2; 23 | optional uint64 term = 3; 24 | } 25 | 26 | message Snapshot { 27 | optional bytes data = 1; 28 | optional SnapshotMetadata metadata = 2; 29 | } 30 | 31 | enum MessageType { 32 | MsgHup = 0; 33 | MsgBeat = 1; 34 | MsgProp = 2; 35 | MsgApp = 3; 36 | MsgAppResp = 4; 37 | MsgVote = 5; 38 | MsgVoteResp = 6; 39 | MsgSnap = 7; 40 | MsgHeartbeat = 8; 41 | MsgHeartbeatResp = 9; 42 | MsgUnreachable = 10; 43 | MsgSnapStatus = 11; 44 | MsgCheckQuorum = 12; 45 | MsgTransferLeader = 13; 46 | MsgTimeoutNow = 14; 47 | MsgReadIndex = 15; 48 | MsgReadIndexResp = 16; 49 | MsgPreVote = 17; 50 | MsgPreVoteResp = 18; 51 | } 52 | 53 | message Message { 54 | optional MessageType type = 1; 55 | optional uint64 to = 2; 56 | optional uint64 from = 3; 57 | optional uint64 term = 4; 58 | optional uint64 logTerm = 5; 59 | optional uint64 index = 6; 60 | repeated Entry entries = 7; 61 | optional uint64 commit = 8; 62 | 
optional Snapshot snapshot = 9; 63 | optional bool reject = 10; 64 | optional uint64 rejectHint = 11; 65 | optional bytes context = 12; 66 | } 67 | 68 | message HardState { 69 | optional uint64 term = 1; 70 | optional uint64 vote = 2; 71 | optional uint64 commit = 3; 72 | } 73 | 74 | enum ConfChangeType { 75 | ConfChangeAddNode = 0; 76 | ConfChangeRemoveNode = 1; 77 | ConfChangeUpdateNode = 2; 78 | } 79 | 80 | message ConfChange { 81 | optional uint64 ID = 1; 82 | optional ConfChangeType Type = 2; 83 | optional uint64 NodeID = 3; 84 | optional bytes Context = 4; 85 | } 86 | -------------------------------------------------------------------------------- /src/proto/record.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | package walpb; 3 | 4 | message Record { 5 | optional int64 type = 1; 6 | optional uint32 crc = 2; 7 | optional bytes data = 3; 8 | } 9 | 10 | message WalSnapshot { 11 | optional uint64 index = 1; 12 | optional uint64 term = 2; 13 | } 14 | -------------------------------------------------------------------------------- /src/storage/log.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include "base/util.h" 7 | #include "storage/log.h" 8 | 9 | namespace libraft { 10 | 11 | raftLog::raftLog(Storage *storage) 12 | : storage_(storage), 13 | committed_(0), 14 | applied_(0) { 15 | } 16 | 17 | raftLog::~raftLog() { 18 | delete storage_; 19 | } 20 | 21 | // maybeAppend returns false if the entries cannot be appended. Otherwise, 22 | // it returns last index of new entries. 
23 | bool 24 | raftLog::maybeAppend(uint64_t index, uint64_t logTerm, 25 | uint64_t committed, const EntryVec& entries, uint64_t *lasti) { 26 | *lasti = 0; 27 | 28 | // check if log index and term match 29 | if (!matchTerm(index, logTerm)) { 30 | return false; 31 | } 32 | 33 | uint64_t lastNewI, ci, offset; 34 | 35 | lastNewI = index + (uint64_t)entries.size(); 36 | 37 | // check if there is conflict entries 38 | ci = findConflict(entries); 39 | 40 | // if conflict entries is already committed 41 | if (ci != 0 && ci <= committed_) { 42 | Fatalf("entry %llu conflict with committed entry [committed(%llu)]", ci, committed_); 43 | } 44 | 45 | if (ci != 0) { 46 | offset = index + 1; 47 | EntryVec appendEntries(entries.begin() + ci - offset, entries.end()); 48 | append(appendEntries); 49 | } 50 | 51 | commitTo(min(committed, lastNewI)); 52 | *lasti = lastNewI; 53 | 54 | return true; 55 | } 56 | 57 | void 58 | raftLog::commitTo(uint64_t tocommit) { 59 | // never decrease commit 60 | if (committed_ >= tocommit) { 61 | return; 62 | } 63 | 64 | // to commit index cannot bigger than last index 65 | if (lastIndex() < tocommit) { 66 | Fatalf( 67 | "tocommit(%llu) is out of range [lastIndex(%llu)]. Was the raft log corrupted, truncated, or lost?", 68 | tocommit, lastIndex()); 69 | } 70 | 71 | committed_ = tocommit; 72 | Debugf("commit to %llu", committed_); 73 | } 74 | 75 | void 76 | raftLog::appliedTo(uint64_t i) { 77 | if (i == 0) { 78 | return; 79 | } 80 | 81 | // applied index cannot bigger than committed index, 82 | // also cannot smaller than already applied index. 
83 | if (committed_ < i || i < applied_) { 84 | Fatalf("applied(%llu) is out of range [prevApplied(%llu), committed(%llu)]", i, applied_, committed_); 85 | } 86 | applied_ = i; 87 | } 88 | 89 | void 90 | raftLog::stableTo(uint64_t i, uint64_t t) { 91 | unstable_.stableTo(i, t); 92 | } 93 | 94 | void 95 | raftLog::stableSnapTo(uint64_t i) { 96 | unstable_.stableSnapTo(i); 97 | } 98 | 99 | uint64_t 100 | raftLog::lastTerm() { 101 | int err; 102 | uint64_t t; 103 | 104 | err = term(lastIndex(), &t); 105 | if (!SUCCESS(err)) { 106 | Fatalf("unexpected error when getting the last term (%s)", kErrString[err]); 107 | } 108 | 109 | return t; 110 | } 111 | 112 | int 113 | raftLog::entries(uint64_t i, uint64_t maxSize, EntryVec *entries) { 114 | entries->clear(); 115 | uint64_t lasti = lastIndex(); 116 | 117 | // valid index check 118 | if (i > lasti) { 119 | return OK; 120 | } 121 | 122 | return slice(i, lasti + 1, maxSize, entries); 123 | } 124 | 125 | // allEntries returns all entries in the log. 126 | void 127 | raftLog::allEntries(EntryVec *entries) { 128 | int err = this->entries(firstIndex(), kNoLimit, entries); 129 | if (SUCCESS(err)) { 130 | return; 131 | } 132 | 133 | if (err == ErrCompacted) { // try again if there was a racing compaction 134 | return allEntries(entries); 135 | } 136 | Fatalf("allEntries fatal: %s", kErrString[err]); 137 | } 138 | 139 | // isUpToDate determines if the given (lastIndex,term) log is more up-to-date 140 | // by comparing the index and term of the last entries in the existing logs. 141 | // If the logs have last entries with different terms, then the log with the 142 | // later term is more up-to-date. If the logs end with the same term, then 143 | // whichever log has the larger lastIndex is more up-to-date. If the logs are 144 | // the same, the given log is up-to-date. 
145 | bool 146 | raftLog::isUpToDate(uint64_t lasti, uint64_t term) { 147 | uint64_t lastT = lastTerm(); 148 | return term > lastT || (term == lastT && lasti >= lastIndex()); 149 | } 150 | 151 | bool 152 | raftLog::maybeCommit(uint64_t maxIndex, uint64_t term) { 153 | uint64_t t; 154 | int err = this->term(maxIndex, &t); 155 | if (maxIndex > committed_ && zeroTermOnErrCompacted(t, err) == term) { 156 | commitTo(maxIndex); 157 | return true; 158 | } 159 | return false; 160 | } 161 | 162 | void 163 | raftLog::restore(const Snapshot& snapshot) { 164 | Infof("log [%s] starts to restore snapshot [index: %llu, term: %llu]", 165 | String().c_str(), snapshot.metadata().index(), snapshot.metadata().term()); 166 | committed_ = snapshot.metadata().index(); 167 | unstable_.restore(snapshot); 168 | } 169 | 170 | // append entries to unstable storage and return last index 171 | // fatal if the first index of entries < committed_ 172 | uint64_t 173 | raftLog::append(const EntryVec& entries) { 174 | if (entries.empty()) { 175 | return lastIndex(); 176 | } 177 | 178 | uint64_t after = entries[0].index() - 1; 179 | if (after < committed_) { 180 | Fatalf("after(%llu) is out of range [committed(%llu)]", after, committed_); 181 | } 182 | 183 | unstable_.truncateAndAppend(entries); 184 | return lastIndex(); 185 | } 186 | 187 | // findConflict finds the index of the conflict. 188 | // It returns the first pair of conflicting entries between the existing 189 | // entries and the given entries, if there are any. 190 | // If there is no conflicting entries, and the existing entries contains 191 | // all the given entries, zero will be returned. 192 | // If there is no conflicting entries, but the given entries contains new 193 | // entries, the index of the first new entry will be returned. 194 | // An entry is considered to be conflicting if it has the same index but 195 | // a different term. 196 | // The first entry MUST have an index equal to the argument 'from'. 
197 | // The index of the given entries MUST be continuously increasing. 198 | uint64_t raftLog::findConflict(const EntryVec& entries) { 199 | size_t i; 200 | for (i = 0; i < entries.size(); ++i) { 201 | if (!matchTerm(entries[i].index(), entries[i].term())) { 202 | const Entry& entry = entries[i]; 203 | uint64_t index = entry.index(); 204 | uint64_t term = entry.term(); 205 | 206 | if (index <= lastIndex()) { 207 | uint64_t dummy; 208 | int err = this->term(index, &dummy); 209 | Infof("found conflict at index %llu [existing term: %llu, conflicting term: %llu]", 210 | index, zeroTermOnErrCompacted(dummy, err), term); 211 | } 212 | 213 | return index; 214 | } 215 | } 216 | 217 | return 0; 218 | } 219 | 220 | void 221 | raftLog::unstableEntries(EntryVec *entries) { 222 | entries->clear(); 223 | size_t i; 224 | for (i = 0; i < unstable_.entries_.size(); ++i) { 225 | entries->push_back(unstable_.entries_[i]); 226 | } 227 | } 228 | 229 | // nextEntries returns all the available entries for execution. 230 | // If applied is smaller than the index of snapshot, it returns all committed 231 | // entries after the index of snapshot. 232 | void 233 | raftLog::nextEntries(EntryVec* entries) { 234 | entries->clear(); 235 | uint64_t offset = max(applied_ + 1, firstIndex()); 236 | if (committed_ + 1 > offset) { 237 | int err = slice(offset, committed_ + 1, kNoLimit, entries); 238 | if (!SUCCESS(err)) { 239 | Fatalf("unexpected error when getting unapplied entries (%s)", kErrString[err]); 240 | } 241 | } 242 | } 243 | 244 | string 245 | raftLog::String() { 246 | char tmp[200]; 247 | snprintf(tmp, sizeof(tmp), "committed=%llu, applied=%llu, unstable.offset=%llu, len(unstable.Entries)=%lu", 248 | committed_, applied_, unstable_.offset_, unstable_.entries_.size()); 249 | 250 | return tmp; 251 | } 252 | 253 | // hasNextEntries returns if there is any available entries for execution. This 254 | // is a fast check without heavy raftLog.slice() in raftLog.nextEnts(). 
255 | bool 256 | raftLog::hasNextEntries() { 257 | return committed_ + 1 > max(applied_ + 1, firstIndex()); 258 | } 259 | 260 | int 261 | raftLog::snapshot(Snapshot **snapshot) { 262 | // first check unstable storage 263 | if (unstable_.snapshot_ != NULL) { 264 | *snapshot = unstable_.snapshot_; 265 | return OK; 266 | } 267 | 268 | // then get snapshot from storage 269 | return storage_->GetSnapshot(snapshot); 270 | } 271 | 272 | // check err code, if success return term, 273 | // return 0 if error code is ErrCompacted 274 | // others Fatal 275 | uint64_t 276 | raftLog::zeroTermOnErrCompacted(uint64_t t, int err) { 277 | if (SUCCESS(err)) { 278 | return t; 279 | } 280 | 281 | if (err == ErrCompacted) { 282 | return 0; 283 | } 284 | 285 | Fatalf("unexpected error: %s", kErrString[err]); 286 | return 0; 287 | } 288 | 289 | bool 290 | raftLog::matchTerm(uint64_t i, uint64_t term) { 291 | int err; 292 | uint64_t t; 293 | 294 | err = this->term(i, &t); 295 | if (!SUCCESS(err)) { 296 | return false; 297 | } 298 | 299 | return t == term; 300 | } 301 | 302 | // returns the term of the entry at index i, if there is any. 
303 | int 304 | raftLog::term(uint64_t i, uint64_t *t) { 305 | uint64_t dummyIndex; 306 | int err = OK; 307 | 308 | *t = 0; 309 | // the valid term range is [index of dummy entry, last index] 310 | dummyIndex = firstIndex() - 1; 311 | if (i < dummyIndex || i > lastIndex()) { 312 | return OK; 313 | } 314 | 315 | // first check in unstable storage 316 | bool ok = unstable_.maybeTerm(i, t); 317 | if (ok) { 318 | return err; 319 | } 320 | 321 | // then check in stable storage 322 | err = storage_->Term(i, t); 323 | if (SUCCESS(err)) { 324 | return err; 325 | } 326 | 327 | if (err == ErrCompacted || err == ErrUnavailable) { 328 | return err; 329 | } 330 | Fatalf("term err:%s", kErrString[err]); 331 | 332 | return err; 333 | } 334 | 335 | uint64_t 336 | raftLog::firstIndex() { 337 | uint64_t i; 338 | int err; 339 | 340 | bool ok = unstable_.maybeFirstIndex(&i); 341 | if (ok) { 342 | return i; 343 | } 344 | 345 | err = storage_->FirstIndex(&i); 346 | if (!SUCCESS(err)) { 347 | Fatalf("firstIndex error:%s", kErrString[err]); 348 | } 349 | 350 | return i; 351 | } 352 | 353 | uint64_t 354 | raftLog::lastIndex() { 355 | uint64_t i; 356 | int err; 357 | 358 | bool ok = unstable_.maybeLastIndex(&i); 359 | if (ok) { 360 | return i; 361 | } 362 | 363 | err = storage_->LastIndex(&i); 364 | if (!SUCCESS(err)) { 365 | Fatalf("lastIndex error:%s", kErrString[err]); 366 | } 367 | 368 | return i; 369 | } 370 | 371 | // slice returns a slice of log entries from lo through hi-1, inclusive. 
372 | int raftLog::slice(uint64_t lo, uint64_t hi, uint64_t maxSize, EntryVec* entries) { 373 | int err; 374 | 375 | // first check if index out of bounds 376 | err = mustCheckOutOfBounds(lo, hi); 377 | if (!SUCCESS(err)) { 378 | return err; 379 | } 380 | 381 | if (lo == hi) { 382 | return OK; 383 | } 384 | 385 | // if lo index in unstable storage 386 | if (lo < unstable_.offset_) { 387 | err = storage_->Entries(lo, min(hi,unstable_.offset_), maxSize, entries); 388 | if (err == ErrCompacted) { 389 | return err; 390 | } else if (err == ErrUnavailable) { 391 | Fatalf("entries[%llu:%llu) is unavailable from storage", lo, min(hi, unstable_.offset_)); 392 | } else if (!SUCCESS(err)) { 393 | Fatalf("storage entries err:%s", kErrString[err]); 394 | } 395 | 396 | if ((uint64_t)entries->size() < min(hi, unstable_.offset_) - lo) { 397 | return OK; 398 | } 399 | } 400 | 401 | // if hi index not in unstable storage 402 | if (hi > unstable_.offset_) { 403 | EntryVec unstable; 404 | unstable_.slice(max(lo, unstable_.offset_), hi, &unstable); 405 | if (entries->size() > 0) { 406 | entries->insert(entries->end(), unstable.begin(), unstable.end()); 407 | } else { 408 | *entries = unstable; 409 | } 410 | } 411 | 412 | limitSize(maxSize, entries); 413 | return OK; 414 | } 415 | 416 | int 417 | raftLog::mustCheckOutOfBounds(uint64_t lo, uint64_t hi) { 418 | if (lo > hi) { 419 | Fatalf("invalid slice %llu > %llu", lo, hi); 420 | } 421 | 422 | uint64_t fi = firstIndex(); 423 | if (lo < fi) { 424 | return ErrCompacted; 425 | } 426 | 427 | uint64_t li = lastIndex(); 428 | if (hi > li + 1) { 429 | Fatalf("slice[%llu,%llu) out of bound [%llu,%llu]", lo, hi, fi, li); 430 | } 431 | 432 | return OK; 433 | } 434 | 435 | // newLog returns log using the given storage. It recovers the log to the state 436 | // that it just commits and applies the latest snapshot. 
437 | raftLog* 438 | newLog(Storage *storage) { 439 | raftLog *log = new raftLog(storage); 440 | 441 | uint64_t firstIndex, lastIndex; 442 | int err; 443 | 444 | // init the first and last log index 445 | err = storage->FirstIndex(&firstIndex); 446 | if (!SUCCESS(err)) { 447 | Fatalf("get first index err:%s", kErrString[err]); 448 | } 449 | 450 | err = storage->LastIndex(&lastIndex); 451 | if (!SUCCESS(err)) { 452 | Fatalf("get last index err:%s", kErrString[err]); 453 | } 454 | 455 | log->unstable_.offset_ = lastIndex + 1; 456 | 457 | // Initialize our committed and applied pointers to the time of the last compaction. 458 | log->committed_ = firstIndex - 1; 459 | log->applied_ = firstIndex - 1; 460 | 461 | return log; 462 | } 463 | 464 | }; // namespace libraft -------------------------------------------------------------------------------- /src/storage/log.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_LOG_H__ 6 | #define __LIBRAFT_LOG_H__ 7 | 8 | #include "libraft.h" 9 | #include "unstable_log.h" 10 | 11 | namespace libraft { 12 | 13 | // Raft log storage 14 | struct raftLog { 15 | // storage contains all stable entries since the last snapshot. 16 | Storage *storage_; 17 | 18 | // unstable contains all unstable entries and snapshot. 19 | // they will be saved into storage. 20 | unstableLog unstable_; 21 | 22 | // committed is the highest log position that is known to be in 23 | // stable storage on a quorum of nodes. 24 | uint64_t committed_; 25 | 26 | // applied is the highest log position that the application has 27 | // been instructed to apply to its state machine. 28 | // Invariant: applied <= committed 29 | uint64_t applied_; 30 | 31 | raftLog(Storage *); 32 | ~raftLog(); 33 | 34 | string String(); 35 | 36 | // maybeAppend returns false if the entries cannot be appended. Otherwise, 37 | // it returns last index of new entries. 
38 | bool maybeAppend(uint64_t index, uint64_t logTerm, 39 | uint64_t committed, const EntryVec& entries, uint64_t *lasti); 40 | 41 | // append entries to unstable storage and return last index 42 | uint64_t append(const EntryVec& entries); 43 | 44 | // finds the index of the conflict. 45 | uint64_t findConflict(const EntryVec& entries); 46 | 47 | // get all unstable entries 48 | void unstableEntries(EntryVec *entries); 49 | 50 | // nextEntries returns all the available entries for execution. 51 | void nextEntries(EntryVec* entries); 52 | 53 | // hasNextEntries returns if there is any available entries for execution. 54 | bool hasNextEntries(); 55 | 56 | // return snapshot of raft log 57 | int snapshot(Snapshot **snapshot); 58 | 59 | uint64_t firstIndex(); 60 | 61 | uint64_t lastIndex(); 62 | 63 | // change committed_ index to tocommit 64 | void commitTo(uint64_t tocommit); 65 | 66 | // change applied index to i 67 | void appliedTo(uint64_t i); 68 | 69 | // unstable storage stable index to log 70 | void stableTo(uint64_t i, uint64_t t); 71 | 72 | // unstable storage stable index to snapshot 73 | void stableSnapTo(uint64_t i); 74 | 75 | // get last index term 76 | uint64_t lastTerm(); 77 | 78 | // get entries from index i, no more than maxSize 79 | int entries(uint64_t i, uint64_t maxSize, EntryVec *entries); 80 | 81 | // allEntries returns all entries in the log. 82 | void allEntries(EntryVec *entries); 83 | 84 | // isUpToDate determines if the given (lastIndex,term) log is more up-to-date 85 | // by comparing the index and term of the last entries in the existing logs. 
86 | bool isUpToDate(uint64_t lasti, uint64_t term); 87 | 88 | // return true if the term of the index equal to term 89 | bool matchTerm(uint64_t i, uint64_t term); 90 | 91 | // return true if maxIndex committed 92 | bool maybeCommit(uint64_t maxIndex, uint64_t term); 93 | 94 | // restore from snapshot 95 | void restore(const Snapshot& snapshot); 96 | 97 | // slice returns a slice of log entries from lo through hi-1, inclusive. 98 | int slice(uint64_t lo, uint64_t hi, uint64_t maxSize, EntryVec* entries); 99 | 100 | // check if [lo,hi] is out of bounds 101 | int mustCheckOutOfBounds(uint64_t lo, uint64_t hi); 102 | 103 | // returns the term of the entry at index i, if there is any. 104 | int term(uint64_t i, uint64_t *t); 105 | 106 | uint64_t zeroTermOnErrCompacted(uint64_t t, int err); 107 | }; 108 | 109 | // newLog returns log using the given storage. It recovers the log to the state 110 | // that it just commits and applies the latest snapshot. 111 | extern raftLog* newLog(Storage *storage); 112 | 113 | }; // namespace libraft 114 | 115 | #endif // __LIBRAFT_LOG_H__ 116 | -------------------------------------------------------------------------------- /src/storage/memory_storage.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include "base/util.h" 7 | #include "storage/memory_storage.h" 8 | 9 | namespace libraft { 10 | 11 | MemoryStorage::MemoryStorage(EntryVec* entries) 12 | : snapShot_(new Snapshot()) { 13 | if (entries == NULL) { 14 | // When starting from scratch populate the list with a dummy entry at term zero. 
15 | entries_.push_back(Entry()); 16 | } else { 17 | for (size_t i = 0; i < entries->size(); ++i) { 18 | entries_.push_back((*entries)[i]); 19 | } 20 | } 21 | } 22 | 23 | MemoryStorage::~MemoryStorage() { 24 | delete snapShot_; 25 | } 26 | 27 | int 28 | MemoryStorage::InitialState(HardState *hs, ConfState *cs) { 29 | *hs = hardState_; 30 | *cs = snapShot_->metadata().conf_state(); 31 | return OK; 32 | } 33 | 34 | int 35 | MemoryStorage::SetHardState(const HardState& hs) { 36 | Mutex mutex(&locker_); 37 | hardState_ = hs; 38 | return OK; 39 | } 40 | 41 | uint64_t 42 | MemoryStorage::firstIndex() { 43 | return entries_[0].index() + 1; 44 | } 45 | 46 | int 47 | MemoryStorage::FirstIndex(uint64_t *index) { 48 | Mutex mutex(&locker_); 49 | *index = firstIndex(); 50 | return OK; 51 | } 52 | 53 | int 54 | MemoryStorage::LastIndex(uint64_t *index) { 55 | Mutex mutex(&locker_); 56 | *index = lastIndex(); 57 | return OK; 58 | } 59 | 60 | uint64_t 61 | MemoryStorage::lastIndex() { 62 | return entries_[0].index() + entries_.size() - 1; 63 | } 64 | 65 | int 66 | MemoryStorage::Term(uint64_t i, uint64_t *term) { 67 | Mutex mutex(&locker_); 68 | *term = 0; 69 | uint64_t offset = entries_[0].index(); 70 | if (i < offset) { 71 | return ErrCompacted; 72 | } 73 | if (i - offset >= entries_.size()) { 74 | return ErrUnavailable; 75 | } 76 | *term = entries_[i - offset].term(); 77 | return OK; 78 | } 79 | 80 | int 81 | MemoryStorage::Entries(uint64_t lo, uint64_t hi, uint64_t maxSize, EntryVec *entries) { 82 | Mutex mutex(&locker_); 83 | 84 | // first check validity of index 85 | uint64_t offset = entries_[0].index(); 86 | if (lo <= offset) { 87 | return ErrCompacted; 88 | } 89 | if (hi > lastIndex() + 1) { 90 | return ErrUnavailable; 91 | } 92 | 93 | // only contains dummy entries. 
94 | if (entries_.size() == 1) { 95 | return ErrUnavailable; 96 | } 97 | size_t i; 98 | for (i = lo - offset; i < hi - offset; ++i) { 99 | entries->push_back(entries_[i]); 100 | } 101 | limitSize(maxSize, entries); 102 | return OK; 103 | } 104 | 105 | int 106 | MemoryStorage::GetSnapshot(Snapshot **snapshot) { 107 | Mutex mutex(&locker_); 108 | *snapshot = snapShot_; 109 | return OK; 110 | } 111 | 112 | // Compact discards all log entries prior to compactIndex. 113 | // It is the application's responsibility to not attempt to compact an index 114 | // greater than raftLog.applied. 115 | int 116 | MemoryStorage::Compact(uint64_t compactIndex) { 117 | Mutex mutex(&locker_); 118 | 119 | uint64_t offset = entries_[0].index(); 120 | if (compactIndex <= offset) { 121 | return ErrCompacted; 122 | } 123 | if (compactIndex > lastIndex()) { 124 | Fatalf("compact %llu is out of bound lastindex(%llu)", compactIndex, lastIndex()); 125 | } 126 | 127 | uint64_t i = compactIndex - offset; 128 | EntryVec entries; 129 | Entry entry; 130 | entry.set_index(entries_[i].index()); 131 | entry.set_term(entries_[i].term()); 132 | entries.push_back(entry); 133 | for (i = i + 1; i < entries_.size(); ++i) { 134 | entries.push_back(entries_[i]); 135 | } 136 | entries_ = entries; 137 | return OK; 138 | } 139 | 140 | // ApplySnapshot overwrites the contents of this Storage object with 141 | // those of the given snapshot. 
142 | int 143 | MemoryStorage::ApplySnapshot(const Snapshot& snapshot) { 144 | Mutex mutex(&locker_); 145 | 146 | //handle check for old snapshot being applied 147 | uint64_t index = snapShot_->metadata().index(); 148 | uint64_t snapIndex = snapshot.metadata().index(); 149 | if (index >= snapIndex) { 150 | return ErrSnapOutOfDate; 151 | } 152 | 153 | snapShot_->CopyFrom(snapshot); 154 | entries_.clear(); 155 | Entry entry; 156 | entry.set_index(snapshot.metadata().index()); 157 | entry.set_term(snapshot.metadata().term()); 158 | entries_.push_back(entry); 159 | return OK; 160 | } 161 | 162 | // Append the new entries to storage. 163 | // entries[0].Index > ms.entries[0].Index 164 | int 165 | MemoryStorage::Append(const EntryVec& entries) { 166 | if (entries.empty()) { 167 | return OK; 168 | } 169 | 170 | Mutex mutex(&locker_); 171 | size_t i; 172 | 173 | uint64_t first = firstIndex(); 174 | uint64_t last = entries[0].index() + entries.size() - 1; 175 | 176 | // shortcut if there is no new entry. 
177 | if (last < first) { 178 | return OK; 179 | } 180 | 181 | EntryVec appendEntries; 182 | // truncate compacted entries 183 | if (first > entries[0].index()) { 184 | uint64_t index = first - entries[0].index(); 185 | appendEntries = EntryVec(entries.begin() + index, entries.end()); 186 | } else { 187 | appendEntries = entries; 188 | } 189 | 190 | uint64_t offset = appendEntries[0].index() - entries_[0].index(); 191 | // TODO: optimize vector copy 192 | if (entries_.size() > offset) { 193 | EntryVec tmp_ents; 194 | //tmp_ents.push_back(Entry()); 195 | for (i = 0; i < offset; ++i) { 196 | tmp_ents.push_back(entries_[i]); 197 | } 198 | entries_.clear(); 199 | entries_ = tmp_ents; 200 | for (i = 0; i < appendEntries.size(); ++i) { 201 | entries_.push_back(appendEntries[i]); 202 | } 203 | return OK; 204 | } 205 | 206 | if (entries_.size() == offset) { 207 | for (i = 0; i < appendEntries.size(); ++i) { 208 | entries_.push_back(appendEntries[i]); 209 | } 210 | return OK; 211 | } 212 | 213 | Fatalf("missing log entry [last: %llu, append at: %llu]", 214 | lastIndex(), appendEntries[0].index()); 215 | return OK; 216 | } 217 | 218 | // CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and 219 | // can be used to reconstruct the state at that point. 220 | // If any configuration changes have been made since the last compaction, 221 | // the result of the last ApplyConfChange must be passed in. 
222 | int 223 | MemoryStorage::CreateSnapshot(uint64_t i, ConfState *cs, const string& data, Snapshot *ss) { 224 | Mutex mutex(&locker_); 225 | 226 | if (i <= snapShot_->metadata().index()) { 227 | return ErrSnapOutOfDate; 228 | } 229 | 230 | uint64_t offset = entries_[0].index(); 231 | if (i > lastIndex()) { 232 | Fatalf("snapshot %d is out of bound lastindex(%llu)", i, lastIndex()); 233 | } 234 | 235 | snapShot_->mutable_metadata()->set_index(i); 236 | snapShot_->mutable_metadata()->set_term(entries_[i - offset].term()); 237 | if (cs != NULL) { 238 | *(snapShot_->mutable_metadata()->mutable_conf_state()) = *cs; 239 | } 240 | snapShot_->set_data(data); 241 | if (ss != NULL) { 242 | *ss = *snapShot_; 243 | } 244 | 245 | return OK; 246 | } 247 | 248 | }; // namespace libraft -------------------------------------------------------------------------------- /src/storage/memory_storage.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_MEMORY_STORAGE_H__ 6 | #define __LIBRAFT_MEMORY_STORAGE_H__ 7 | 8 | #include "libraft.h" 9 | #include "base/mutex.h" 10 | 11 | namespace libraft { 12 | 13 | // MemoryStorage implements the Storage interface backed by an 14 | // in-memory array. 
15 | class MemoryStorage : public Storage { 16 | public: 17 | MemoryStorage(EntryVec* entries); 18 | virtual ~MemoryStorage(); 19 | 20 | int InitialState(HardState *, ConfState *); 21 | int FirstIndex(uint64_t *index); 22 | int LastIndex(uint64_t *index); 23 | int Term(uint64_t i, uint64_t *term); 24 | int Entries(uint64_t lo, uint64_t hi, uint64_t maxSize, EntryVec *entries); 25 | int GetSnapshot(Snapshot **snapshot); 26 | int SetHardState(const HardState& ); 27 | 28 | int Append(const EntryVec& entries); 29 | int Compact(uint64_t compactIndex); 30 | int ApplySnapshot(const Snapshot& snapshot); 31 | int CreateSnapshot(uint64_t i, ConfState *cs, const string& data, Snapshot *ss); 32 | 33 | private: 34 | uint64_t firstIndex(); 35 | uint64_t lastIndex(); 36 | 37 | public: 38 | HardState hardState_; 39 | Snapshot *snapShot_; 40 | 41 | // ents[i] has raft log position i+snapshot.Metadata.Index 42 | EntryVec entries_; 43 | 44 | // Protects access to all fields. Most methods of MemoryStorage are 45 | // run on the raft goroutine, but Append() is run on an application 46 | // goroutine. 47 | Locker locker_; 48 | }; 49 | 50 | }; // namespace libraft 51 | 52 | #endif // __LIBRAFT_MEMORY_STORAGE_H__ 53 | -------------------------------------------------------------------------------- /src/storage/unstable_log.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/logger.h" 6 | #include "storage/unstable_log.h" 7 | 8 | namespace libraft { 9 | 10 | // maybeFirstIndex returns the index of the first possible entry in entries 11 | // if it has a snapshot. 12 | bool 13 | unstableLog::maybeFirstIndex(uint64_t *first) { 14 | if (snapshot_ != NULL) { 15 | *first = snapshot_->metadata().index() + 1; 16 | return true; 17 | } 18 | 19 | *first = 0; 20 | return false; 21 | } 22 | 23 | // maybeLastIndex returns the last index if it has at least one 24 | // unstable entry or snapshot. 
25 | bool 26 | unstableLog::maybeLastIndex(uint64_t *last) { 27 | *last = 0; 28 | // first check entities 29 | if (entries_.size() > 0) { 30 | *last = offset_ + entries_.size() - 1; 31 | return true; 32 | } 33 | 34 | // then check snapshot 35 | if (snapshot_ != NULL) { 36 | *last = snapshot_->metadata().index(); 37 | return true; 38 | } 39 | 40 | return false; 41 | } 42 | 43 | // maybeTerm returns the term of the entry at index i, if there 44 | // is any. 45 | bool 46 | unstableLog::maybeTerm(uint64_t i, uint64_t *term) { 47 | *term = 0; 48 | if (i < offset_) { 49 | if (snapshot_ == NULL) { 50 | return false; 51 | } 52 | if (snapshot_->metadata().index() == i) { 53 | *term = snapshot_->metadata().term(); 54 | return true; 55 | } 56 | 57 | return false; 58 | } 59 | 60 | uint64_t last; 61 | bool ok = maybeLastIndex(&last); 62 | if (!ok) { 63 | return false; 64 | } 65 | if (i > last) { 66 | return false; 67 | } 68 | *term = entries_[i - offset_].term(); 69 | return true; 70 | } 71 | 72 | void 73 | unstableLog::stableTo(uint64_t i, uint64_t t) { 74 | uint64_t gt; 75 | bool ok = maybeTerm(i, >); 76 | if (!ok) { 77 | return; 78 | } 79 | 80 | // if i < offset, term is matched with the snapshot 81 | // only update the unstable entries if term is matched with 82 | // an unstable entry. 
83 | if (gt == t && i >= offset_) { 84 | entries_.erase(entries_.begin(), entries_.begin() + i + 1 - offset_); 85 | offset_ = i + 1; 86 | //Debugf("stable to %llu, entries size:%d, offset:%llu", i, entries_.size(), offset_); 87 | } 88 | } 89 | 90 | void 91 | unstableLog::stableSnapTo(uint64_t i) { 92 | if (snapshot_ != NULL && snapshot_->metadata().index() == i) { 93 | delete snapshot_; 94 | snapshot_ = NULL; 95 | } 96 | } 97 | 98 | void 99 | unstableLog::restore(const Snapshot& snapshot) { 100 | offset_ = snapshot.metadata().index() + 1; 101 | entries_.clear(); 102 | if (snapshot_ == NULL) { 103 | snapshot_ = new Snapshot(); 104 | } 105 | snapshot_->CopyFrom(snapshot); 106 | } 107 | 108 | void 109 | unstableLog::truncateAndAppend(const EntryVec& entries) { 110 | uint64_t after = entries[0].index(); 111 | 112 | if (after == offset_ + uint64_t(entries_.size())) { 113 | // after is the next index in the u.entries 114 | // directly append 115 | entries_.insert(entries_.end(), entries.begin(), entries.end()); 116 | Infof("ENTRY size: %d", entries_.size()); 117 | return; 118 | } 119 | 120 | if (after <= offset_) { 121 | // The log is being truncated to before our current offset 122 | // portion, so set the offset and replace the entries 123 | Infof("replace the unstable entries from index %llu", after); 124 | offset_ = after; 125 | entries_ = entries; 126 | return; 127 | } 128 | 129 | // truncate to after and copy to u.entries then append 130 | Infof("truncate the unstable entries before index %llu", after); 131 | vector slice; 132 | this->slice(offset_, after, &slice); 133 | entries_ = slice; 134 | entries_.insert(entries_.end(), entries.begin(), entries.end()); 135 | } 136 | 137 | void 138 | unstableLog::slice(uint64_t lo, uint64_t hi, EntryVec *entries) { 139 | mustCheckOutOfBounds(lo, hi); 140 | entries->assign(entries_.begin() + lo - offset_, entries_.begin() + hi - offset_); 141 | } 142 | 143 | // u.offset <= lo <= hi <= u.offset+len(u.offset) 144 | void 145 | 
unstableLog::mustCheckOutOfBounds(uint64_t lo, uint64_t hi) { 146 | if (lo > hi) { 147 | Fatalf("invalid unstable.slice %llu > %llu", lo, hi); 148 | } 149 | 150 | uint64_t upper = offset_ + (uint64_t)entries_.size(); 151 | if (lo < offset_ || upper < hi) { 152 | Fatalf("unstable.slice[%llu,%llu) out of bound [%llu,%llu]", lo, hi, offset_, upper); 153 | } 154 | } 155 | 156 | }; // namespace libraft -------------------------------------------------------------------------------- /src/storage/unstable_log.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_UNSTABLE_LOG_H__ 6 | #define __LIBRAFT_UNSTABLE_LOG_H__ 7 | 8 | #include "libraft.h" 9 | 10 | namespace libraft { 11 | 12 | // unstable.entries[i] has raft log position i+unstable.offset. 13 | // Note that unstable.offset may be less than the highest log 14 | // position in storage; this means that the next write to storage 15 | // might need to truncate the log before persisting unstable.entries. 16 | struct unstableLog { 17 | unstableLog() : snapshot_(NULL) { 18 | } 19 | 20 | // the incoming unstable snapshot, if any. 21 | Snapshot* snapshot_; 22 | 23 | // all entries that have not yet been written to storage. 24 | EntryVec entries_; 25 | uint64_t offset_; 26 | 27 | void truncateAndAppend(const EntryVec& entries); 28 | 29 | // maybeFirstIndex returns the index of the first possible entry in entries 30 | // if it has a snapshot. 31 | bool maybeFirstIndex(uint64_t *first); 32 | 33 | // maybeLastIndex returns the last index if it has at least one 34 | // unstable entry or snapshot. 
35 | bool maybeLastIndex(uint64_t* last); 36 | 37 | bool maybeTerm(uint64_t i, uint64_t *term); 38 | 39 | void stableTo(uint64_t i, uint64_t t); 40 | 41 | void stableSnapTo(uint64_t i); 42 | 43 | void restore(const Snapshot& snapshot); 44 | 45 | void slice(uint64_t lo, uint64_t hi, EntryVec *entries); 46 | 47 | void mustCheckOutOfBounds(uint64_t lo, uint64_t hi); 48 | }; 49 | 50 | }; // namespace libraft 51 | 52 | #endif // __LIBRAFT_UNSTABLE_LOG_H__ 53 | -------------------------------------------------------------------------------- /src/wal/decoder.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "base/crc32c.h" 7 | #include "base/io_buffer.h" 8 | #include "base/file.h" 9 | #include "io/buffer_io_reader.h" 10 | #include "wal/decoder.h" 11 | #include "wal/wal.h" 12 | 13 | namespace libraft { 14 | 15 | const uint32_t kMinSectorSize = 512; 16 | 17 | int 18 | Decoder::decode(Record* rec) { 19 | //rec->Reset(); 20 | locker.Lock(); 21 | 22 | return decodeRecord(rec); 23 | } 24 | 25 | int 26 | Decoder::decodeRecord(Record* rec) { 27 | if (readers.size() == 0) { 28 | return kEOF; 29 | } 30 | 31 | int err; 32 | int64_t lenField = 0; 33 | int64_t recBytes, padBytes; 34 | 35 | err = readers[0]->ReadInt64(&lenField); 36 | if (err == kEOF || lenField == 0) { 37 | // hit end of file or preallocated space 38 | BufferIOReader* reader = readers[0]; 39 | delete reader; 40 | readers.erase(readers.begin()); 41 | if (readers.size() == 0) { 42 | return kEOF; 43 | } 44 | 45 | lastValidOff = 0; 46 | return decodeRecord(rec); 47 | } 48 | 49 | if (err != kOK) { 50 | return err; 51 | } 52 | 53 | decodeFrameSize(lenField, &recBytes, &padBytes); 54 | 55 | char *data = new char[recBytes + padBytes]; 56 | memset(data, '\0', recBytes + padBytes); 57 | int total = readers[0]->Read(data, recBytes + padBytes, &err); 58 | (void)total; 59 | if (err != kOK) { 60 | // ReadFull returns io.kEOF 
only if no bytes were read 61 | // the Decoder should treat this as an kErrUnexpectedEOF instead. 62 | if (err == kEOF) { 63 | err = kErrUnexpectedEOF; 64 | goto out; 65 | } 66 | 67 | goto out; 68 | } 69 | 70 | if (!rec->ParseFromArray(data, recBytes)) { 71 | if (isTornEntry(data, recBytes + padBytes)) { 72 | err = kErrUnexpectedEOF; 73 | } else { 74 | err = kEOF; 75 | } 76 | goto out; 77 | } 78 | 79 | // skip crc checking if the record type is crcType 80 | if (rec->type() != crcType) { 81 | crc32 = Value(rec->data().c_str(),rec->data().length()); 82 | if (crc32 != rec->crc()) { 83 | if (isTornEntry(data, recBytes + padBytes)) { 84 | return kErrUnexpectedEOF; 85 | } 86 | return kErrCRCMismatch; 87 | } 88 | } 89 | 90 | // record decoded as valid; point last valid offset to end of record 91 | lastValidOff += recBytes + padBytes + 8; 92 | 93 | out: 94 | delete [] data; 95 | return err; 96 | } 97 | 98 | void 99 | Decoder::decodeFrameSize(int64_t lenField, int64_t* recBytes, int64_t* padBytes) { 100 | // the record size is stored in the lower 56 bits of the 64-bit length 101 | *recBytes = (int64_t)(((uint64_t)lenField) & (uint64_t(0xffff) ^ ((uint64_t)(0xff) << 56))); 102 | *padBytes = 0; 103 | 104 | // non-zero padding is indicated by set MSb / a negative length 105 | if (lenField < 0) { 106 | // padding is stored in lower 3 bits of length MSB 107 | *padBytes = int64_t(((uint64_t)lenField >> 56) & 0x7); 108 | } 109 | } 110 | 111 | // isTornEntry determines whether the last entry of the WAL was partially written 112 | // and corrupted because of a torn write. 
113 | struct chunk { 114 | const unsigned char* buf; 115 | int32_t len; 116 | }; 117 | 118 | bool 119 | Decoder::isTornEntry(const char* data, uint32_t len) { 120 | if (readers.size() != 1) { 121 | return false; 122 | } 123 | 124 | int64_t fileOff = lastValidOff + 8; 125 | int64_t curOff = 0; 126 | 127 | vector chunks; 128 | 129 | // split data on sector boundaries 130 | for (; curOff < len;) { 131 | int32_t chunkLen = kMinSectorSize - (fileOff % kMinSectorSize); 132 | if (chunkLen > len - curOff) { 133 | chunkLen = len - curOff; 134 | } 135 | 136 | chunks.push_back((chunk){.buf = (unsigned char*)data + curOff, .len = chunkLen}); 137 | fileOff += chunkLen; 138 | curOff += chunkLen; 139 | } 140 | 141 | // if any data for a sector chunk is all 0, it's a torn write 142 | uint32_t i; 143 | for (i = 0; i < chunks.size(); i++) { 144 | chunk& c = chunks[i]; 145 | bool isZero = true; 146 | int j; 147 | for (j = 0; j < c.len; j++) { 148 | if (c.buf[j] != '\0') { 149 | isZero = false; 150 | break; 151 | } 152 | } 153 | if (isZero) { 154 | return true; 155 | } 156 | } 157 | 158 | return false; 159 | } 160 | 161 | Decoder::Decoder(const vector& reader) { 162 | uint32_t i; 163 | for (i = 0; i < reader.size(); ++i) { 164 | readers.push_back(new BufferIOReader(reader[i])); 165 | } 166 | 167 | lastValidOff = 0; 168 | crc32 = 0; 169 | } 170 | 171 | Decoder::~Decoder() { 172 | uint32_t i; 173 | for (i = 0; i < readers.size(); ++i) { 174 | delete readers[i]; 175 | } 176 | } 177 | 178 | Decoder* 179 | newDecoder(const vector& reader) { 180 | return new Decoder(reader); 181 | } 182 | 183 | }; // namespace libraft -------------------------------------------------------------------------------- /src/wal/decoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_DECODER_H__ 6 | #define __LIBRAFT_DECODER_H__ 7 | 8 | #include 9 | #include "proto/record.pb.h" 10 | #include "base/mutex.h" 11 
| 12 | using namespace std; 13 | using namespace walpb; 14 | 15 | namespace libraft { 16 | 17 | class BufferIOReader; 18 | class IOReader; 19 | 20 | struct Decoder { 21 | vector readers; 22 | 23 | // lastValidOff file offset following the last valid decoded record 24 | int64_t lastValidOff; 25 | 26 | uint32_t crc32; 27 | 28 | Locker locker; 29 | 30 | Decoder(const vector& reader); 31 | ~Decoder(); 32 | 33 | int decode(Record* rec); 34 | int decodeRecord(Record* rec); 35 | void decodeFrameSize(int64_t len, int64_t* recBytes, int64_t* padBytes); 36 | 37 | // isTornEntry determines whether the last entry of the WAL was partially written 38 | // and corrupted because of a torn write. 39 | bool isTornEntry(const char* data, uint32_t len); 40 | }; 41 | 42 | //extern Decoder* newDecoder(); 43 | extern Decoder* newDecoder(const vector& reader); 44 | 45 | }; // namespace libraft 46 | 47 | #endif // __LIBRAFT_DECODER_H__ -------------------------------------------------------------------------------- /src/wal/encoder.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/file.h" 6 | #include "base/crc32c.h" 7 | #include "wal/encoder.h" 8 | 9 | namespace libraft { 10 | 11 | encoder* 12 | newEncoder(File* file, uint32_t prevCrc, int32_t pageOffset) { 13 | encoder* ec = new encoder(); 14 | return ec; 15 | } 16 | 17 | int 18 | encoder::encode(Record* rec) { 19 | locker.Lock(); 20 | string data; 21 | rec->set_crc(Value(rec->data().c_str(), rec->data().size())); 22 | 23 | rec->SerializeToString(&data); 24 | 25 | uint64_t lenField; 26 | int64_t padBytes; 27 | encodeFrameSize(data.length(), &lenField, &padBytes); 28 | int err; 29 | err = file->WriteUint64(lenField); 30 | if (err != kOK) { 31 | return err; 32 | } 33 | 34 | if (padBytes != 0) { 35 | data += string('\0', padBytes); 36 | } 37 | 38 | return file->Write(data); 39 | } 40 | 41 | void 42 | encoder::encodeFrameSize(int64_t 
dataBytes, uint64_t* lenField, int64_t* padBytes) { 43 | *lenField = uint64_t(dataBytes); 44 | // force 8 byte alignment so length never gets a torn write 45 | *padBytes = (8 - (dataBytes % 8)) % 8; 46 | if (*padBytes != 0) { 47 | *lenField |= uint64_t(0x80 | *padBytes) << 56; 48 | } 49 | } 50 | 51 | }; // namespace libraft -------------------------------------------------------------------------------- /src/wal/encoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_ENCODER_H__ 6 | #define __LIBRAFT_ENCODER_H__ 7 | 8 | #include 9 | #include "proto/record.pb.h" 10 | #include "base/mutex.h" 11 | 12 | using namespace std; 13 | using namespace walpb; 14 | 15 | namespace libraft { 16 | 17 | class File; 18 | 19 | struct encoder { 20 | char *buf; 21 | Locker locker; 22 | uint32_t crc; 23 | File* file; 24 | 25 | int encode(Record* rec); 26 | 27 | void encodeFrameSize(int64_t dataBytes, uint64_t* lenField, int64_t* padBytes); 28 | }; 29 | 30 | extern encoder* newEncoder(File* file, uint32_t prevCrc, int32_t pageOffset); 31 | 32 | }; // namespace libraft 33 | 34 | #endif // __LIBRAFT_ENCODER_H__ -------------------------------------------------------------------------------- /src/wal/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_UTIL_H__ 6 | #define __LIBRAFT_UTIL_H__ 7 | 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | namespace libraft { 14 | 15 | class FileSystemAdaptor; 16 | 17 | extern string walName(uint64_t seq, uint64_t index); 18 | 19 | extern bool readWalNames(FileSystemAdaptor*, const string& dir, vector* names); 20 | 21 | // searchIndex returns the last array index of names whose raft index section is 22 | // equal to or smaller than the given index. 23 | // The given names MUST be sorted. 
24 | int searchIndex(const vector& names, uint64_t index); 25 | 26 | // names should have been sorted based on sequence number. 27 | // isValidSeq checks whether seq increases continuously. 28 | bool isValidSeq(const vector& names, int index); 29 | 30 | }; 31 | 32 | #endif /* __LIBRAFT_UTIL_H__ */ -------------------------------------------------------------------------------- /src/wal/wal.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include "base/file_system_adaptor.h" 6 | #include "base/logger.h" 7 | #include "wal/util.h" 8 | #include "wal/wal.h" 9 | 10 | namespace libraft { 11 | 12 | static WAL* openAtIndex(walOption*, const string&, const WalSnapshot&, bool); 13 | 14 | WAL* 15 | createWAL(walOption* option, const string& dir,const string& meta) { 16 | FileSystemAdaptor* fs = option->fs; 17 | 18 | if (fs->PathExists(dir)) { 19 | Errorf("wal dir %s exists",dir.c_str()); 20 | return NULL; 21 | } 22 | 23 | // keep temporary wal directory so WAL initialization appears atomic 24 | string tmpdir = dir + ".tmp"; 25 | if (fs->DirectoryExists(tmpdir) && !fs->DeleteFile(tmpdir, true)) { 26 | Errorf("delete wal tmp dir %s fail",dir.c_str()); 27 | return NULL; 28 | } 29 | 30 | if (!fs->CreateDirectory(tmpdir)) { 31 | Errorf("create wal tmp dir %s fail",dir.c_str()); 32 | return NULL; 33 | } 34 | 35 | string file = fs->Join(tmpdir, walName(0,0)); 36 | 37 | return NULL; 38 | } 39 | 40 | WAL* 41 | OpenWAL(walOption* option, const string& dir, const WalSnapshot& snapshot) { 42 | WAL *wal = openAtIndex(option, dir, snapshot, true); 43 | if (!wal) { 44 | return NULL; 45 | } 46 | 47 | return wal; 48 | } 49 | 50 | WAL* 51 | OpenForRead(walOption* option, const string& dir, const WalSnapshot& snapshot) { 52 | return openAtIndex(option, dir, snapshot, false); 53 | } 54 | 55 | static WAL* 56 | openAtIndex(walOption* option, const string& dir, const WalSnapshot& snapshot, bool write) { 57 | 
vector names; 58 | 59 | if (!readWalNames(option->fs, dir, &names)) { 60 | return NULL; 61 | } 62 | 63 | int nameIndex = searchIndex(names, snapshot.index()); 64 | if (nameIndex < 0 || !isValidSeq(names, nameIndex)) { 65 | return NULL; 66 | } 67 | 68 | // open the wal files 69 | return NULL; 70 | } 71 | 72 | }; // namespace libraft -------------------------------------------------------------------------------- /src/wal/wal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_WAL_H__ 6 | #define __LIBRAFT_WAL_H__ 7 | 8 | #include 9 | #include "libraft.h" 10 | #include "proto/record.pb.h" 11 | 12 | using namespace std; 13 | using namespace walpb; 14 | 15 | namespace libraft { 16 | 17 | class FileSystemAdaptor; 18 | class Logger; 19 | 20 | enum walDataType { 21 | metadataType = 1, 22 | entryType, 23 | stateType, 24 | crcType, 25 | snapshotType, 26 | }; 27 | 28 | struct walOption { 29 | FileSystemAdaptor* fs; 30 | 31 | Logger *logger; 32 | }; 33 | 34 | enum walErrorCode { 35 | kErrCRCMismatch = -100, 36 | }; 37 | 38 | // WAL is a logical representation of the stable storage. 39 | // WAL is either in read mode or append mode but not both. 40 | // A newly created WAL is in append mode, and ready for appending records. 41 | // A just opened WAL is in read mode, and ready for reading records. 42 | // The WAL will be ready for appending after reading out all the previous records. 43 | struct WAL { 44 | 45 | }; 46 | 47 | // Create creates a WAL ready for appending records. The given metadata is 48 | // recorded at the head of each WAL file, and can be retrieved with ReadAll. 49 | extern WAL* CreateWAL(walOption*, const string& dir,const string& meta); 50 | 51 | // OpenWAL opens the WAL at the given snap. 52 | // The snap SHOULD have been previously saved to the WAL, or the following 53 | // ReadAll will fail. 
54 | // The returned WAL is ready to read and the first record will be the one after 55 | // the given snap. The WAL cannot be appended to before reading out all of its 56 | // previous records. 57 | extern WAL* OpenWAL(walOption*, const string& dir, const WalSnapshot&); 58 | 59 | // OpenForRead only opens the wal files for read. 60 | // Write on a read only wal panics. 61 | extern WAL* OpenForRead(walOption*, const string& dir, const WalSnapshot&); 62 | 63 | }; // namespace libraft 64 | 65 | #endif // __LIBRAFT_WAL_H__ -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | all_test 2 | *.o 3 | -------------------------------------------------------------------------------- /test/crc32c_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | #include 6 | #include "base/crc32c.h" 7 | 8 | using namespace libraft; 9 | 10 | TEST(crc32cTests, StandardResults) { 11 | // From rfc3720 section B.4. 
12 | char buf[32]; 13 | 14 | memset(buf, 0, sizeof(buf)); 15 | ASSERT_EQ((uint32_t)0x8a9136aa, Value(buf, sizeof(buf))); 16 | 17 | memset(buf, 0xff, sizeof(buf)); 18 | ASSERT_EQ((uint32_t)0x62a8ab43, Value(buf, sizeof(buf))); 19 | 20 | for (int i = 0; i < 32; i++) { 21 | buf[i] = i; 22 | } 23 | ASSERT_EQ((uint32_t)0x46dd794e, Value(buf, sizeof(buf))); 24 | 25 | for (int i = 0; i < 32; i++) { 26 | buf[i] = 31 - i; 27 | } 28 | ASSERT_EQ((uint32_t)0x113fdb5c, Value(buf, sizeof(buf))); 29 | 30 | unsigned char data[48] = { 31 | 0x01, 0xc0, 0x00, 0x00, 32 | 0x00, 0x00, 0x00, 0x00, 33 | 0x00, 0x00, 0x00, 0x00, 34 | 0x00, 0x00, 0x00, 0x00, 35 | 0x14, 0x00, 0x00, 0x00, 36 | 0x00, 0x00, 0x04, 0x00, 37 | 0x00, 0x00, 0x00, 0x14, 38 | 0x00, 0x00, 0x00, 0x18, 39 | 0x28, 0x00, 0x00, 0x00, 40 | 0x00, 0x00, 0x00, 0x00, 41 | 0x02, 0x00, 0x00, 0x00, 42 | 0x00, 0x00, 0x00, 0x00, 43 | }; 44 | ASSERT_EQ((uint32_t)0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); 45 | } 46 | 47 | TEST(crc32cTests, Values) { 48 | ASSERT_NE(Value("a", 1), Value("foo", 3)); 49 | } 50 | 51 | TEST(crc32cTests, Extend) { 52 | ASSERT_EQ(Value("hello world", 11), 53 | Extend(Value("hello ", 6), "world", 5)); 54 | } 55 | 56 | TEST(crc32cTests, Mask) { 57 | uint32_t crc = Value("foo", 3); 58 | ASSERT_NE(crc, Mask(crc)); 59 | ASSERT_NE(crc, Mask(Mask(crc))); 60 | ASSERT_EQ(crc, Unmask(Mask(crc))); 61 | ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); 62 | } 63 | -------------------------------------------------------------------------------- /test/io_buffer_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "base/io_buffer.h" 8 | 9 | using namespace libraft; 10 | 11 | TEST(iobufferTests, TestMemoryBuffer) { 12 | IOBuffer* mb = newMemoryBuffer(); 13 | 14 | mb->WriteUint64(1024); 15 | int64_t ret = 0; 16 | mb->ReadInt64(&ret); 17 | 18 | ASSERT_EQ(ret, 1024); 19 | 20 | delete mb; 21 | } 
22 | 23 | TEST(iobufferTests, TestMemoryBufferWithString) { 24 | string test = string("\b\xef\xfd\x02"); 25 | IOBuffer* mb = newMemoryBufferWithString(test); 26 | char tmp[20] = {'\0'}; 27 | int err; 28 | int size = mb->ReadFull(tmp, 20, &err); 29 | 30 | ASSERT_EQ((int)test.size(), size); 31 | ASSERT_EQ(0, strncmp(tmp, test.c_str(), size)); 32 | 33 | delete mb; 34 | } 35 | -------------------------------------------------------------------------------- /test/main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | 7 | int main(int argc, char* argv[]) { 8 | testing::InitGoogleTest(&argc, argv); 9 | return RUN_ALL_TESTS(); 10 | } 11 | -------------------------------------------------------------------------------- /test/memory_storage_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "libraft.h" 7 | #include "raft_test_util.h" 8 | #include "base/logger.h" 9 | #include "base/util.h" 10 | #include "storage/memory_storage.h" 11 | 12 | using namespace libraft; 13 | 14 | TEST(memoryStorageTests, TestStorageTerm) { 15 | EntryVec entries = { 16 | initEntry(3,3), 17 | initEntry(4,4), 18 | initEntry(5,5), 19 | }; 20 | 21 | struct tmp { 22 | uint64_t i; 23 | int werr; 24 | uint64_t wterm; 25 | } tests[] = { 26 | {.i = 2, .werr = ErrCompacted, .wterm = 0}, 27 | {.i = 3, .werr = OK, .wterm = 3}, 28 | {.i = 4, .werr = OK, .wterm = 4}, 29 | {.i = 5, .werr = OK, .wterm = 5}, 30 | {.i = 6, .werr = ErrUnavailable, .wterm = 0}, 31 | }; 32 | size_t i = 0; 33 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 34 | const tmp &test = tests[i]; 35 | MemoryStorage s(&entries); 36 | uint64_t term; 37 | int err = s.Term(test.i, &term); 38 | EXPECT_EQ(err, test.werr) << "i: " << i; 39 | EXPECT_EQ(term, test.wterm) << "i: " << i; 40 | } 41 | } 42 | 43 | TEST(memoryStorageTests, 
TestStorageEntries) { 44 | EntryVec entries = { 45 | initEntry(3,3), 46 | initEntry(4,4), 47 | initEntry(5,5), 48 | initEntry(6,6), 49 | }; 50 | 51 | struct tmp { 52 | uint64_t lo, hi, maxsize; 53 | int werr; 54 | EntryVec entries; 55 | } tests[] = { 56 | { 57 | .lo = 2, .hi = 6, .maxsize = kNoLimit, .werr = ErrCompacted, 58 | .entries = {}, 59 | }, 60 | { 61 | .lo = 3, .hi = 4, .maxsize = kNoLimit, .werr = ErrCompacted, 62 | .entries = {}, 63 | }, 64 | { 65 | .lo = 4, .hi = 5, .maxsize = kNoLimit, .werr = OK, 66 | .entries = {initEntry(4,4)}, 67 | }, 68 | { 69 | .lo = 4, .hi = 6, .maxsize = kNoLimit, .werr = OK, 70 | .entries = {initEntry(4,4), initEntry(5,5)}, 71 | }, 72 | { 73 | .lo = 4, .hi = 7, .maxsize = kNoLimit, .werr = OK, 74 | .entries = {initEntry(4,4), initEntry(5,5), initEntry(6,6)}, 75 | }, 76 | // even if maxsize is zero, the first entry should be returned 77 | { 78 | .lo = 4, .hi = 7, .maxsize = 0, .werr = OK, 79 | .entries = {initEntry(4,4),}, 80 | }, 81 | // limit to 2 82 | { 83 | .lo = 4, .hi = 7, .maxsize = entries[1].ByteSizeLong() + entries[2].ByteSizeLong(), .werr = OK, 84 | .entries = {initEntry(4,4),initEntry(5,5),}, 85 | }, 86 | // limit to 2 87 | { 88 | .lo = 4, .hi = 7, .maxsize = entries[1].ByteSizeLong() + entries[2].ByteSizeLong() + entries[3].ByteSizeLong() / 2, .werr = OK, 89 | .entries = {initEntry(4,4),initEntry(5,5),}, 90 | }, 91 | { 92 | .lo = 4, .hi = 7, .maxsize = entries[1].ByteSizeLong() + entries[2].ByteSizeLong() + entries[3].ByteSizeLong() - 1, .werr = OK, 93 | .entries = {initEntry(4,4),initEntry(5,5),}, 94 | }, 95 | // all 96 | { 97 | .lo = 4, .hi = 7, .maxsize = entries[1].ByteSizeLong() + entries[2].ByteSizeLong() + entries[3].ByteSizeLong(), .werr = OK, 98 | .entries = {initEntry(4,4),initEntry(5,5),initEntry(6,6)}, 99 | }, 100 | }; 101 | 102 | size_t i = 0; 103 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 104 | const tmp &test = tests[i]; 105 | MemoryStorage s(&entries); 106 | EntryVec ret; 107 | 108 | int err = 
s.Entries(test.lo, test.hi, test.maxsize, &ret); 109 | EXPECT_EQ(err, test.werr) << "i: " << i; 110 | EXPECT_TRUE(isDeepEqualEntries(ret, test.entries)) << "i: " << i << ",ret:" << ret.size(); 111 | } 112 | } 113 | 114 | TEST(memoryStorageTests, TestStorageLastIndex) { 115 | EntryVec entries = { 116 | initEntry(3,3), 117 | initEntry(4,4), 118 | initEntry(5,5), 119 | }; 120 | 121 | MemoryStorage s(&entries); 122 | 123 | uint64_t last; 124 | int err = s.LastIndex(&last); 125 | EXPECT_EQ(OK, err); 126 | EXPECT_EQ((int)last, 5); 127 | 128 | s.Append(EntryVec({initEntry(6,5)})); 129 | 130 | err = s.LastIndex(&last); 131 | EXPECT_EQ(OK, err); 132 | EXPECT_EQ((int)last, 6); 133 | } 134 | 135 | TEST(memoryStorageTests, TestStorageFirstIndex) { 136 | EntryVec entries = { 137 | initEntry(3,3), 138 | initEntry(4,4), 139 | initEntry(5,5), 140 | }; 141 | 142 | MemoryStorage s(&entries); 143 | 144 | { 145 | uint64_t first; 146 | int err = s.FirstIndex(&first); 147 | 148 | EXPECT_EQ(OK, err); 149 | EXPECT_EQ((int)first, 4); 150 | } 151 | 152 | s.Compact(4); 153 | 154 | { 155 | uint64_t first; 156 | int err = s.FirstIndex(&first); 157 | 158 | EXPECT_EQ(OK, err); 159 | EXPECT_EQ((int)first, 5); 160 | } 161 | } 162 | 163 | TEST(memoryStorageTests, TestStorageCompact) { 164 | EntryVec entries = { 165 | initEntry(3,3), 166 | initEntry(4,4), 167 | initEntry(5,5), 168 | }; 169 | 170 | MemoryStorage s(&entries); 171 | 172 | struct tmp { 173 | uint64_t i; 174 | int werr; 175 | uint64_t wterm; 176 | uint64_t windex; 177 | int wlen; 178 | } tests[] = { 179 | { .i = 2, .werr = ErrCompacted, .wterm = 3, .windex = 3, .wlen = 3, }, 180 | { .i = 3, .werr = ErrCompacted, .wterm = 3, .windex = 3, .wlen = 3, }, 181 | { .i = 4, .werr = OK, .wterm = 4, .windex = 4, .wlen = 2, }, 182 | { .i = 5, .werr = OK, .wterm = 5, .windex = 5, .wlen = 1, }, 183 | }; 184 | 185 | size_t i = 0; 186 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 187 | const tmp &test = tests[i]; 188 | MemoryStorage tmp_s(&entries); 
189 | 190 | int err = tmp_s.Compact(test.i); 191 | EXPECT_EQ(err, test.werr); 192 | EXPECT_EQ(tmp_s.entries_[0].index(), test.windex); 193 | EXPECT_EQ(tmp_s.entries_[0].term(), test.wterm); 194 | EXPECT_EQ((int)tmp_s.entries_.size(), test.wlen); 195 | } 196 | } 197 | 198 | static inline Snapshot 199 | initSnapshot(const string& data, uint64_t index, uint64_t term, const ConfState& cs) { 200 | Snapshot ss; 201 | SnapshotMetadata *metadata = ss.mutable_metadata(); 202 | 203 | metadata->set_index(index); 204 | metadata->set_term(term); 205 | *(metadata->mutable_conf_state()) = cs; 206 | ss.set_data(data); 207 | 208 | return ss; 209 | } 210 | 211 | TEST(memoryStorageTests, TestStorageCreateSnapshot) { 212 | EntryVec entries = { 213 | initEntry(3,3), 214 | initEntry(4,4), 215 | initEntry(5,5), 216 | }; 217 | ConfState cs; 218 | cs.add_nodes(1); 219 | cs.add_nodes(2); 220 | cs.add_nodes(3); 221 | 222 | string data = "data"; 223 | 224 | struct tmp { 225 | uint64_t i; 226 | int werr; 227 | Snapshot wsnap; 228 | } tests[] = { 229 | { .i = 4, .werr = OK, .wsnap = initSnapshot(data, 4, 4, cs), }, 230 | { .i = 5, .werr = OK, .wsnap = initSnapshot(data, 5, 5, cs), }, 231 | }; 232 | 233 | size_t i = 0; 234 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 235 | const tmp &test = tests[i]; 236 | MemoryStorage tmp_s(&entries); 237 | Snapshot ss; 238 | 239 | int err = tmp_s.CreateSnapshot(test.i, &cs, data, &ss); 240 | EXPECT_EQ(err, test.werr); 241 | EXPECT_TRUE(isDeepEqualSnapshot(&ss, &test.wsnap)); 242 | } 243 | } 244 | 245 | TEST(memoryStorageTests, TestStorageAppend) { 246 | EntryVec entries = { 247 | initEntry(3,3), 248 | initEntry(4,4), 249 | initEntry(5,5), 250 | }; 251 | 252 | struct tmp { 253 | EntryVec entries; 254 | int werr; 255 | EntryVec wentries; 256 | } tests[] = { 257 | { 258 | .entries = {initEntry(1,1), initEntry(2,2), }, 259 | .werr = OK, 260 | .wentries = {initEntry(3,3), initEntry(4,4), initEntry(5,5)}, 261 | }, 262 | { 263 | .entries = {initEntry(3,3), 
initEntry(4,4), initEntry(5,5)}, 264 | .werr = OK, 265 | .wentries = {initEntry(3,3), initEntry(4,4), initEntry(5,5)}, 266 | }, 267 | { 268 | .entries = {initEntry(3,3), initEntry(4,6), initEntry(5,6)}, 269 | .werr = OK, 270 | .wentries = {initEntry(3,3), initEntry(4,6), initEntry(5,6)}, 271 | }, 272 | { 273 | .entries = {initEntry(3,3), initEntry(4,4), initEntry(5,5), initEntry(6,5)}, 274 | .werr = OK, 275 | .wentries = {initEntry(3,3), initEntry(4,4), initEntry(5,5), initEntry(6,5)}, 276 | }, 277 | // truncate incoming entries, truncate the existing entries and append 278 | { 279 | .entries = {initEntry(2,3), initEntry(3,3), initEntry(4,5)}, 280 | .werr = OK, 281 | .wentries = {initEntry(3,3), initEntry(4,5),}, 282 | }, 283 | // truncate the existing entries and append 284 | { 285 | .entries = {initEntry(4,5)}, 286 | .werr = OK, 287 | .wentries = {initEntry(3,3), initEntry(4,5),}, 288 | }, 289 | // direct append 290 | { 291 | .entries = {initEntry(6,5)}, 292 | .werr = OK, 293 | .wentries = {initEntry(3,3), initEntry(4,4),initEntry(5,5),initEntry(6,5),}, 294 | }, 295 | }; 296 | 297 | size_t i = 0; 298 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 299 | const tmp &test = tests[i]; 300 | MemoryStorage tmp_s(&entries); 301 | 302 | int err = tmp_s.Append(test.entries); 303 | EXPECT_EQ(err, test.werr); 304 | EXPECT_TRUE(isDeepEqualEntries(test.wentries, tmp_s.entries_)) << "i: " << i << ",diff:" << entryVecDebugString(test.wentries) << " to " << entryVecDebugString(tmp_s.entries_); 305 | } 306 | } 307 | 308 | TEST(memoryStorageTests, TestStorageApplySnapshot) { 309 | ConfState cs; 310 | cs.add_nodes(1); 311 | cs.add_nodes(2); 312 | cs.add_nodes(3); 313 | 314 | string data = "data"; 315 | 316 | Snapshot tests[] = { 317 | initSnapshot(data, 4, 4, cs), 318 | initSnapshot(data, 3, 3, cs), 319 | }; 320 | 321 | MemoryStorage s(NULL); 322 | 323 | { 324 | //Apply Snapshot successful 325 | Snapshot tt = tests[0]; 326 | EXPECT_EQ(OK, s.ApplySnapshot(tt)); 327 | } 328 | 329 | { 
330 | //Apply Snapshot fails due to ErrSnapOutOfDate 331 | Snapshot tt = tests[1]; 332 | EXPECT_EQ(ErrSnapOutOfDate, s.ApplySnapshot(tt)); 333 | } 334 | } -------------------------------------------------------------------------------- /test/node_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "libraft.h" 8 | #include "base/util.h" 9 | #include "core/raft.h" 10 | #include "core/progress.h" 11 | #include "core/read_only.h" 12 | #include "core/node.h" 13 | #include "storage/memory_storage.h" 14 | #include "raft_test_util.h" 15 | 16 | using namespace libraft; 17 | 18 | vector msgs; 19 | 20 | static void appendStep(raft *, const Message &msg) { 21 | msgs.push_back(Message(msg)); 22 | } 23 | 24 | //TODO 25 | TEST(nodeTests, TestNodeStep) { 26 | } 27 | 28 | // TODO 29 | // Cancel and Stop should unblock Step() 30 | TEST(nodeTests, TestNodeStepUnblock) { 31 | } 32 | 33 | // TestNodePropose, despite its name, exercises node.ReadIndex: it checks that the request context reaches the underlying raft as a single MsgReadIndex message. 
34 | TEST(nodeTests, TestNodePropose) { 35 | msgs.clear(); 36 | vector readStates; 37 | 38 | MemoryStorage *s = new MemoryStorage(NULL); 39 | vector peers; 40 | peers.push_back(1); 41 | raft *r = newTestRaft(1, peers, 10, 1, s); 42 | Config config; 43 | NodeImpl *n = new NodeImpl(r, &config); 44 | 45 | readStates.push_back(new ReadState(1, "somedata")); 46 | r->readStates_ = readStates; 47 | 48 | n->Campaign(); 49 | Ready *ready = n->get_ready(); 50 | 51 | while (true) { 52 | EXPECT_EQ(ready->readStates, readStates); 53 | 54 | s->Append(ready->entries); 55 | 56 | if (ready->softState.leader == r->id_) { 57 | n->Advance(); 58 | break; 59 | } 60 | n->Advance(); 61 | } 62 | 63 | r->stateStepFunc_ = appendStep; 64 | string wrequestCtx = "somedata2"; 65 | n->ReadIndex(wrequestCtx); 66 | 67 | EXPECT_EQ((int)msgs.size(), 1); 68 | EXPECT_EQ(msgs[0].type(), MsgReadIndex); 69 | EXPECT_EQ(msgs[0].entries(0).data(), wrequestCtx); 70 | 71 | delete n; 72 | } 73 | 74 | // TestNodeReadIndexToOldLeader ensures that raftpb.MsgReadIndex to old leader 75 | // gets forwarded to the new leader and 'send' method does not attach its term. 
76 | TEST(nodeTests, TestNodeReadIndexToOldLeader) { 77 | vector peers = {1,2,3}; 78 | vector sts; 79 | vector storages; 80 | raft *a, *b, *c; 81 | 82 | { 83 | MemoryStorage *s = new MemoryStorage(NULL); 84 | 85 | a = newTestRaft(1, peers, 10, 1, s); 86 | storages.push_back(s); 87 | sts.push_back(new raftStateMachine(a)); 88 | } 89 | { 90 | MemoryStorage *s = new MemoryStorage(NULL); 91 | 92 | b = newTestRaft(2, peers, 10, 1, s); 93 | storages.push_back(s); 94 | sts.push_back(new raftStateMachine(b)); 95 | } 96 | { 97 | MemoryStorage *s = new MemoryStorage(NULL); 98 | 99 | c = newTestRaft(3, peers, 10, 1, s); 100 | storages.push_back(s); 101 | sts.push_back(new raftStateMachine(c)); 102 | } 103 | 104 | network *net = newNetwork(sts); 105 | 106 | // elect a as leader 107 | { 108 | vector tmp_msgs = { initMessage(1,1,MsgHup) }; 109 | net->send(&tmp_msgs); 110 | } 111 | 112 | EntryVec testEntries = {initEntry(0,0,"testdata")}; 113 | 114 | // send readindex request to b(follower) 115 | b->step(initMessage(2,2,MsgReadIndex,&testEntries)); 116 | 117 | // verify b(follower) forwards this message to r1(leader) with term not set 118 | EXPECT_EQ((int)b->outMsgs_.size(), 1); 119 | 120 | Message readIndexMsg1 = initMessage(2,1,MsgReadIndex, &testEntries); 121 | 122 | EXPECT_TRUE(isDeepEqualMessage(*b->outMsgs_[0], readIndexMsg1)); 123 | 124 | // send readindex request to c(follower) 125 | c->step(initMessage(3,3,MsgReadIndex,&testEntries)); 126 | 127 | // verify c(follower) forwards this message to r1(leader) with term not set 128 | EXPECT_EQ((int)c->outMsgs_.size(), 1); 129 | Message readIndexMsg2 = initMessage(3,1,MsgReadIndex, &testEntries); 130 | EXPECT_TRUE(isDeepEqualMessage(*c->outMsgs_[0], readIndexMsg2)); 131 | 132 | // now elect c as leader 133 | { 134 | vector tmp_msgs = {initMessage(3,3,MsgHup)}; 135 | net->send(&tmp_msgs); 136 | } 137 | 138 | // let a steps the two messages previously we got from b, c 139 | a->step(readIndexMsg1); 140 | a->step(readIndexMsg2); 141 
| 142 | // verify a(follower) forwards these messages again to c(new leader) 143 | EXPECT_EQ((int)a->outMsgs_.size(), 2); 144 | 145 | Message readIndexMsg3 = initMessage(1,3,MsgReadIndex, &testEntries); 146 | EXPECT_TRUE(isDeepEqualMessage(*a->outMsgs_[0], readIndexMsg3)); 147 | EXPECT_TRUE(isDeepEqualMessage(*a->outMsgs_[1], readIndexMsg3)); 148 | 149 | delete net; 150 | } 151 | 152 | // TestNodeProposeConfig ensures that node.ProposeConfChange sends the given configuration proposal 153 | // to the underlying raft. 154 | TEST(nodeTests, TestNodeProposeConfig) { 155 | msgs.clear(); 156 | 157 | MemoryStorage *s = new MemoryStorage(NULL); 158 | vector peers = {1}; 159 | raft *r = newTestRaft(1, peers, 10, 1, s); 160 | Config config; 161 | NodeImpl *n = new NodeImpl(r, &config); 162 | 163 | n->Campaign(); 164 | Ready *ready = n->get_ready(); 165 | 166 | while (true) { 167 | s->Append(ready->entries); 168 | 169 | // change the step function to appendStep until this raft becomes leader 170 | if (ready->softState.leader == r->id_) { 171 | r->stateStepFunc_ = appendStep; 172 | n->Advance(); 173 | break; 174 | } 175 | n->Advance(); 176 | } 177 | 178 | ConfChange cc; 179 | cc.set_type(ConfChangeAddNode); 180 | cc.set_nodeid(1); 181 | string ccdata; 182 | cc.SerializeToString(&ccdata); 183 | 184 | n->ProposeConfChange(cc); 185 | ready = n->get_ready(); 186 | n->Stop(); 187 | 188 | EXPECT_EQ((int)msgs.size(), 1); 189 | EXPECT_EQ(msgs[0].type(), MsgProp); 190 | EXPECT_EQ(msgs[0].entries(0).data(), ccdata); 191 | 192 | delete n; 193 | } 194 | 195 | // TestNodeProposeAddDuplicateNode ensures that two proposes to add the same node should 196 | // not affect the later propose to add new node. 
197 | void applyReadyEntries(Ready* ready, EntryVec* readyEntries, MemoryStorage *s, NodeImpl *n) { 198 | if (ready == NULL) { 199 | n->Advance(); 200 | return; 201 | } 202 | size_t i; 203 | s->Append(ready->entries); 204 | Ready *nready; 205 | for (i = 0; i < ready->entries.size(); ++i) { 206 | const Entry& entry = ready->entries[i]; 207 | 208 | readyEntries->push_back(entry); 209 | if (entry.type() == EntryNormal || entry.type() == EntryConfChange) { 210 | ConfChange cc; 211 | cc.ParseFromString(entry.data()); 212 | ConfState cs; 213 | n->ApplyConfChange(cc, &cs); 214 | nready = n->get_ready(); 215 | //applyReadyEntries(nready, readyEntries, s, n); 216 | } 217 | } 218 | n->Advance(); 219 | } 220 | 221 | TEST(nodeTests, TestNodeProposeAddDuplicateNode) { 222 | MemoryStorage *s = new MemoryStorage(NULL); 223 | vector peers; 224 | peers.push_back(1); 225 | raft *r = newTestRaft(1, peers, 10, 1, s); 226 | Config config; 227 | NodeImpl *n = new NodeImpl(r, &config); 228 | 229 | EntryVec readyEntries; 230 | n->Campaign(); 231 | Ready *ready = n->get_ready(); 232 | applyReadyEntries(ready, &readyEntries, s, n); 233 | 234 | ConfChange cc1, cc2; 235 | string ccdata1, ccdata2; 236 | 237 | cc1.set_type(ConfChangeAddNode); 238 | cc1.set_nodeid(1); 239 | cc1.SerializeToString(&ccdata1); 240 | n->ProposeConfChange(cc1); 241 | ready = n->get_ready(); 242 | applyReadyEntries(ready, &readyEntries, s, n); 243 | 244 | // try add the same node again 245 | n->ProposeConfChange(cc1); 246 | ready = n->get_ready(); 247 | applyReadyEntries(ready, &readyEntries, s, n); 248 | 249 | // the new node join should be ok 250 | cc2.set_type(ConfChangeAddNode); 251 | cc2.set_nodeid(2); 252 | cc2.SerializeToString(&ccdata2); 253 | n->ProposeConfChange(cc2); 254 | ready = n->get_ready(); 255 | applyReadyEntries(ready, &readyEntries, s, n); 256 | 257 | EXPECT_EQ((int)readyEntries.size(), 4); 258 | EXPECT_EQ(readyEntries[1].data(), ccdata1); 259 | EXPECT_EQ(readyEntries[3].data(), ccdata2); 260 | 261 
| delete n; 262 | } 263 | -------------------------------------------------------------------------------- /test/progress_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "libraft.h" 7 | #include "base/util.h" 8 | #include "core/progress.h" 9 | 10 | using namespace libraft; 11 | 12 | bool deepEqualInflights(const inflights& in1, const inflights& in2) { 13 | EXPECT_EQ(in1.start_, in2.start_); 14 | EXPECT_EQ(in1.count_, in2.count_); 15 | EXPECT_EQ(in1.size_, in2.size_); 16 | EXPECT_EQ(in1.buffer_.size(), in2.buffer_.size()); 17 | size_t i = 0; 18 | for (i = 0; i < in1.buffer_.size(); ++i) { 19 | EXPECT_EQ(in1.buffer_[i], in2.buffer_[i]) << "i: " << i << ",in1:" << in1.buffer_[i] << ",in2:" << in2.buffer_[i]; 20 | } 21 | 22 | return true; 23 | } 24 | 25 | TEST(progressTests, TestInflightsAdd) { 26 | inflights ins(10); 27 | int i; 28 | 29 | for (i = 0; i < 5; ++i) { 30 | ins.add(i); 31 | } 32 | 33 | { 34 | inflights wantIns(10); 35 | wantIns.start_ = 0; 36 | wantIns.count_ = 5; 37 | wantIns.size_ = 10; 38 | wantIns.buffer_ = vector{0,1,2,3,4,0,0,0,0,0}; 39 | // ↓------------ 40 | // 0, 1, 2, 3, 4, 0, 0, 0, 0, 0 41 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 42 | } 43 | 44 | for (i = 5; i < 10; ++i) { 45 | ins.add(i); 46 | } 47 | 48 | { 49 | inflights wantIns(10); 50 | wantIns.start_ = 0; 51 | wantIns.count_ = 10; 52 | wantIns.size_ = 10; 53 | wantIns.buffer_ = vector{0,1,2,3,4,5,6,7,8,9}; 54 | // ↓-------------------------- 55 | // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 56 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 57 | } 58 | 59 | // rotating case 60 | inflights ins2(10); 61 | ins2.start_ = 5; 62 | 63 | for (i = 0; i < 5; ++i) { 64 | ins2.add(i); 65 | } 66 | { 67 | inflights wantIns(10); 68 | wantIns.start_ = 5; 69 | wantIns.count_ = 5; 70 | wantIns.size_ = 10; 71 | wantIns.buffer_ = vector{0, 0, 0, 0, 0, 0, 1, 2, 3, 4}; 72 | // ↓------------ 73 | 
// 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 74 | EXPECT_EQ(true, deepEqualInflights(ins2, wantIns)); 75 | } 76 | for (i = 5; i < 10; ++i) { 77 | ins2.add(i); 78 | } 79 | { 80 | inflights wantIns(10); 81 | wantIns.start_ = 5; 82 | wantIns.count_ = 10; 83 | wantIns.size_ = 10; 84 | wantIns.buffer_ = vector{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}; 85 | // ---------------↓------------ 86 | // 5, 6, 7, 8, 9, 0, 1, 2, 3, 4 87 | EXPECT_EQ(true, deepEqualInflights(ins2, wantIns)); 88 | } 89 | } 90 | 91 | TEST(progressTests, TestInflightFreeTo) { 92 | inflights ins(10); 93 | int i; 94 | 95 | for (i = 0; i < 10; ++i) { 96 | ins.add(i); 97 | } 98 | 99 | ins.freeTo(4); 100 | { 101 | inflights wantIns(10); 102 | wantIns.start_ = 5; 103 | wantIns.count_ = 5; 104 | wantIns.size_ = 10; 105 | wantIns.buffer_ = vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; 106 | // ↓------------ 107 | // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 108 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 109 | } 110 | 111 | ins.freeTo(8); 112 | { 113 | inflights wantIns(10); 114 | wantIns.start_ = 9; 115 | wantIns.count_ = 1; 116 | wantIns.size_ = 10; 117 | wantIns.buffer_ = vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 118 | // ↓ 119 | // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 120 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 121 | } 122 | 123 | // rotating case 124 | for (i = 10; i < 15; ++i) { 125 | ins.add(i); 126 | } 127 | ins.freeTo(12); 128 | { 129 | inflights wantIns(10); 130 | wantIns.start_ = 3; 131 | wantIns.count_ = 2; 132 | wantIns.size_ = 10; 133 | wantIns.buffer_ = vector{10, 11, 12, 13, 14, 5, 6, 7, 8, 9 }; 134 | // ↓---- 135 | // 10, 11, 12, 13, 14, 5, 6, 7, 8, 9 136 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 137 | } 138 | 139 | ins.freeTo(14); 140 | { 141 | inflights wantIns(10); 142 | wantIns.start_ = 0; 143 | wantIns.count_ = 0; 144 | wantIns.size_ = 10; 145 | wantIns.buffer_ = vector{10, 11, 12, 13, 14, 5, 6, 7, 8, 9 }; 146 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 147 | } 148 | } 149 | 150 | TEST(progressTests, 
TestInflightFreeFirstOne) { 151 | inflights ins(10); 152 | int i; 153 | 154 | for (i = 0; i < 10; ++i) { 155 | ins.add(i); 156 | } 157 | 158 | ins.freeFirstOne(); 159 | { 160 | inflights wantIns(10); 161 | wantIns.start_ = 1; 162 | wantIns.count_ = 9; 163 | wantIns.size_ = 10; 164 | wantIns.buffer_ = vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; 165 | // ↓----------------------- 166 | // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 167 | EXPECT_EQ(true, deepEqualInflights(ins, wantIns)); 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /test/raft_flow_controller_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "libraft.h" 8 | #include "raft_test_util.h" 9 | #include "core/raft.h" 10 | #include "base/util.h" 11 | #include "core/progress.h" 12 | #include "core/read_only.h" 13 | #include "storage/memory_storage.h" 14 | 15 | using namespace libraft; 16 | 17 | // TestMsgAppFlowControlFull ensures: 18 | // 1. msgApp can fill the sending window until full 19 | // 2. when the window is full, no more msgApp can be sent. 
20 | TEST(raftFlowController, TestMsgAppFlowControlFull) { 21 | vector peers = {1,2}; 22 | Storage *s = new MemoryStorage(NULL); 23 | raft *r = newTestRaft(1, peers, 5, 1, s); 24 | r->becomeCandidate(); 25 | r->becomeLeader(); 26 | Progress *pr2 = r->progressMap_[2]; 27 | 28 | // force the progress to be in replicate state 29 | pr2->becomeReplicate(); 30 | // fill in the inflights window 31 | int i; 32 | for (i = 0; i < r->maxInfilght_; ++i) { 33 | { 34 | EntryVec entries = {initEntry(0,0,"somedata")}; 35 | Message msg = initMessage(1,1,MsgProp,&entries); 36 | 37 | r->step(msg); 38 | } 39 | 40 | MessageVec msgs; 41 | r->readMessages(&msgs); 42 | EXPECT_EQ((int)msgs.size(), 1); 43 | } 44 | 45 | // ensure 1 46 | EXPECT_TRUE(pr2->inflights_.full()); 47 | 48 | // ensure 2 49 | for (i = 0; i < 10; ++i) { 50 | { 51 | EntryVec entries = {initEntry(0,0,"somedata")}; 52 | Message msg = initMessage(1,1,MsgProp,&entries); 53 | r->step(msg); 54 | } 55 | 56 | MessageVec msgs; 57 | r->readMessages(&msgs); 58 | EXPECT_EQ((int)msgs.size(), 0); 59 | } 60 | 61 | delete r; 62 | } 63 | 64 | // TestMsgAppFlowControlMoveForward ensures msgAppResp can move 65 | // forward the sending window correctly: 66 | // 1. valid msgAppResp.index moves the windows to pass all smaller or equal index. 67 | // 2. out-of-dated msgAppResp has no effect on the sliding window. 
68 | TEST(raftFlowController, TestMsgAppFlowControlMoveForward) { 69 | vector peers = {1,2}; 70 | Storage *s = new MemoryStorage(NULL); 71 | raft *r = newTestRaft(1, peers, 5, 1, s); 72 | r->becomeCandidate(); 73 | r->becomeLeader(); 74 | Progress *pr2 = r->progressMap_[2]; 75 | 76 | // force the progress to be in replicate state 77 | pr2->becomeReplicate(); 78 | // fill in the inflights window 79 | int i; 80 | for (i = 0; i < r->maxInfilght_; ++i) { 81 | { 82 | EntryVec entries = {initEntry(0,0,"somedata")}; 83 | Message msg = initMessage(1,1,MsgProp,&entries); 84 | r->step(msg); 85 | } 86 | 87 | MessageVec msgs; 88 | r->readMessages(&msgs); 89 | } 90 | 91 | // 1 is noop, 2 is the first proposal we just sent. 92 | // so we start with 2. 93 | for (i = 2; i < r->maxInfilght_; ++i) { 94 | { 95 | Message msg = initMessage(2,1,MsgAppResp, NULL,i); 96 | 97 | r->step(msg); 98 | } 99 | 100 | MessageVec msgs; 101 | r->readMessages(&msgs); 102 | 103 | // fill in the inflights window again 104 | { 105 | EntryVec entries = {initEntry(0,0,"somedata")}; 106 | Message msg = initMessage(1,1,MsgProp,&entries); 107 | r->step(msg); 108 | } 109 | r->readMessages(&msgs); 110 | EXPECT_EQ((int)msgs.size(), 1); 111 | 112 | // ensure 1 113 | EXPECT_TRUE(pr2->inflights_.full()); 114 | 115 | // ensure 2 116 | int j; 117 | for (j = 0; j < i; ++j) { 118 | { 119 | Message msg = initMessage(2,1,MsgAppResp,NULL,j); 120 | r->step(msg); 121 | } 122 | 123 | EXPECT_TRUE(pr2->inflights_.full()); 124 | } 125 | } 126 | 127 | delete r; 128 | } 129 | 130 | // TestMsgAppFlowControlRecvHeartbeat ensures a heartbeat response 131 | // frees one slot if the window is full. 
132 | TEST(raftFlowController, TestMsgAppFlowControlRecvHeartbeat) { 133 | vector peers = {1,2}; 134 | Storage *s = new MemoryStorage(NULL); 135 | raft *r = newTestRaft(1, peers, 5, 1, s); 136 | r->becomeCandidate(); 137 | r->becomeLeader(); 138 | Progress *pr2 = r->progressMap_[2]; 139 | 140 | // force the progress to be in replicate state 141 | pr2->becomeReplicate(); 142 | // fill in the inflights window 143 | int i; 144 | for (i = 0; i < r->maxInfilght_; ++i) { 145 | { 146 | EntryVec entries = {initEntry(0,0,"somedata")}; 147 | Message msg = initMessage(1,1,MsgProp,&entries); 148 | r->step(msg); 149 | } 150 | 151 | MessageVec msgs; 152 | r->readMessages(&msgs); 153 | } 154 | 155 | for (i = 1; i < 5; ++i) { 156 | EXPECT_TRUE(pr2->inflights_.full()); 157 | 158 | // recv tt msgHeartbeatResp and expect one free slot 159 | int j; 160 | for (j = 0; j < i; ++j) { 161 | { 162 | Message msg = initMessage(2,1,MsgHeartbeatResp,NULL); 163 | r->step(msg); 164 | } 165 | MessageVec msgs; 166 | r->readMessages(&msgs); 167 | EXPECT_FALSE(pr2->inflights_.full()); 168 | } 169 | 170 | // one slot 171 | { 172 | EntryVec entries = {initEntry(0,0,"somedata")}; 173 | Message msg = initMessage(1,1,MsgProp,&entries); 174 | 175 | r->step(msg); 176 | MessageVec msgs; 177 | r->readMessages(&msgs); 178 | EXPECT_EQ((int)msgs.size(), 1); 179 | } 180 | 181 | // and just one slot 182 | for (j = 0; j < 10; ++j) { 183 | { 184 | EntryVec entries = {initEntry(0,0,"somedata")}; 185 | Message msg = initMessage(1,1,MsgProp,&entries); 186 | r->step(msg); 187 | } 188 | MessageVec msgs; 189 | r->readMessages(&msgs); 190 | EXPECT_EQ((int)msgs.size(), 0); 191 | } 192 | 193 | // clear all pending messages. 
194 | { 195 | Message msg = initMessage(2,1,MsgHeartbeatResp,NULL); 196 | 197 | r->step(msg); 198 | } 199 | MessageVec msgs; 200 | r->readMessages(&msgs); 201 | } 202 | 203 | delete r; 204 | } 205 | -------------------------------------------------------------------------------- /test/raft_snap_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "libraft.h" 8 | #include "raft_test_util.h" 9 | #include "base/util.h" 10 | #include "core/progress.h" 11 | #include "core/raft.h" 12 | #include "core/read_only.h" 13 | #include "storage/memory_storage.h" 14 | 15 | using namespace libraft; 16 | 17 | Snapshot testingSnap() { 18 | Snapshot ts; 19 | ts.mutable_metadata()->set_index(11); 20 | ts.mutable_metadata()->set_term(11); 21 | ts.mutable_metadata()->mutable_conf_state()->add_nodes(1); 22 | ts.mutable_metadata()->mutable_conf_state()->add_nodes(2); 23 | 24 | return ts; 25 | } 26 | 27 | TEST(raftPaperTests, TestSendingSnapshotSetPendingSnapshot) { 28 | vector peers = {1}; 29 | Storage *s = new MemoryStorage(NULL); 30 | raft *r = newTestRaft(1, peers, 10, 1, s); 31 | r->restore(testingSnap()); 32 | r->becomeCandidate(); 33 | r->becomeLeader(); 34 | 35 | // force set the next of node 1, so that 36 | // node 1 needs a snapshot 37 | r->progressMap_[2]->next_ = r->raftLog_->firstIndex(); 38 | 39 | { 40 | Message msg = initMessage(2,1,MsgAppResp,NULL, r->progressMap_[2]->next_ - 1); 41 | msg.set_reject(true); 42 | 43 | r->step(msg); 44 | } 45 | 46 | EXPECT_EQ((int)r->progressMap_[2]->pendingSnapshot_, 11); 47 | 48 | delete r; 49 | } 50 | 51 | TEST(raftPaperTests, TestPendingSnapshotPauseReplication) { 52 | vector peers = {1,2}; 53 | Storage *s = new MemoryStorage(NULL); 54 | raft *r = newTestRaft(1, peers, 10, 1, s); 55 | r->restore(testingSnap()); 56 | r->becomeCandidate(); 57 | r->becomeLeader(); 58 | 59 | r->progressMap_[2]->becomeSnapshot(11); 60 | 61 | 
{ 62 | EntryVec entries = {initEntry(0,0,"somedata")}; 63 | 64 | Message msg = initMessage(1,1,MsgProp,&entries); 65 | r->step(msg); 66 | } 67 | 68 | MessageVec msgs; 69 | r->readMessages(&msgs); 70 | EXPECT_EQ((int)msgs.size(), 0); 71 | 72 | delete r; 73 | } 74 | 75 | TEST(raftPaperTests, TestSnapshotFailure) { 76 | vector peers = {1,2}; 77 | Storage *s = new MemoryStorage(NULL); 78 | raft *r = newTestRaft(1, peers, 10, 1, s); 79 | r->restore(testingSnap()); 80 | r->becomeCandidate(); 81 | r->becomeLeader(); 82 | 83 | r->progressMap_[2]->next_ = 1; 84 | r->progressMap_[2]->becomeSnapshot(11); 85 | 86 | { 87 | Message msg = initMessage(2,1,MsgSnapStatus,NULL); 88 | msg.set_reject(true); 89 | r->step(msg); 90 | } 91 | 92 | EXPECT_EQ((int)r->progressMap_[2]->pendingSnapshot_, 0); 93 | EXPECT_EQ((int)r->progressMap_[2]->next_, 1); 94 | EXPECT_TRUE(r->progressMap_[2]->paused_); 95 | 96 | delete r; 97 | } 98 | 99 | TEST(raftPaperTests, TestSnapshotSucceed) { 100 | vector peers = {1,2}; 101 | Storage *s = new MemoryStorage(NULL); 102 | raft *r = newTestRaft(1, peers, 10, 1, s); 103 | r->restore(testingSnap()); 104 | r->becomeCandidate(); 105 | r->becomeLeader(); 106 | 107 | r->progressMap_[2]->next_ = 1; 108 | r->progressMap_[2]->becomeSnapshot(11); 109 | 110 | { 111 | Message msg = initMessage(2,1,MsgSnapStatus,NULL); 112 | msg.set_reject(false); 113 | r->step(msg); 114 | } 115 | 116 | EXPECT_EQ((int)r->progressMap_[2]->pendingSnapshot_, 0); 117 | EXPECT_EQ((int)r->progressMap_[2]->next_, 12); 118 | EXPECT_TRUE(r->progressMap_[2]->paused_); 119 | 120 | delete r; 121 | } 122 | 123 | TEST(raftPaperTests, TestSnapshotAbort) { 124 | vector peers = {1,2}; 125 | Storage *s = new MemoryStorage(NULL); 126 | raft *r = newTestRaft(1, peers, 10, 1, s); 127 | r->restore(testingSnap()); 128 | r->becomeCandidate(); 129 | r->becomeLeader(); 130 | 131 | r->progressMap_[2]->next_ = 1; 132 | r->progressMap_[2]->becomeSnapshot(11); 133 | 134 | // A successful msgAppResp that has a 
higher/equal index than the 135 | // pending snapshot should abort the pending snapshot. 136 | { 137 | Message msg = initMessage(2,1,MsgAppResp,NULL,11); 138 | r->step(msg); 139 | } 140 | 141 | EXPECT_EQ((int)r->progressMap_[2]->pendingSnapshot_, 0); 142 | EXPECT_EQ((int)r->progressMap_[2]->next_, 12); 143 | 144 | delete r; 145 | } 146 | -------------------------------------------------------------------------------- /test/raft_test_util.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "raft_test_util.h" 8 | #include "base/util.h" 9 | #include "core/raft.h" 10 | 11 | using namespace libraft; 12 | extern stateMachine *nopStepper; 13 | 14 | // nextEnts returns the appliable entries and updates the applied index 15 | void nextEnts(raft *r, MemoryStorage *s, EntryVec *entries) { 16 | entries->clear(); 17 | // Transfer all unstable entries to "stable" storage. 18 | EntryVec tmp; 19 | r->raftLog_->unstableEntries(&tmp); 20 | s->Append(tmp); 21 | r->raftLog_->stableTo(r->raftLog_->lastIndex(), r->raftLog_->lastTerm()); 22 | 23 | r->raftLog_->nextEntries(entries); 24 | r->raftLog_->appliedTo(r->raftLog_->committed_); 25 | } 26 | 27 | string raftLogString(raftLog *log) { 28 | char buf[1024] = {'\0'}; 29 | string str = ""; 30 | 31 | snprintf(buf, sizeof(buf), "committed: %llu\n", log->committed_); 32 | str += buf; 33 | 34 | snprintf(buf, sizeof(buf), "applied: %llu\n", log->applied_); 35 | str += buf; 36 | 37 | EntryVec entries; 38 | log->allEntries(&entries); 39 | 40 | snprintf(buf, sizeof(buf), "entries size: %lu\n", entries.size()); 41 | str += buf; 42 | 43 | size_t i; 44 | for (i = 0; i < entries.size(); ++i) { 45 | str += entryString(entries[i]); 46 | } 47 | 48 | return str; 49 | } 50 | 51 | bool operator < (const connem& c1, const connem& c2) { 52 | if (c1.from < c2.from) { 53 | return true; 54 | } 55 | if (c1.from == c2.from) { 56 | return c1.to < 
c2.to; 57 | } 58 | return false; 59 | } 60 | 61 | raftStateMachine::raftStateMachine(Config *c) { 62 | config = c; 63 | raft = newRaft(c); 64 | } 65 | 66 | raftStateMachine::raftStateMachine(struct raft *r) 67 | : config(NULL), 68 | raft(r) { 69 | } 70 | 71 | raftStateMachine::~raftStateMachine() { 72 | if (config) { 73 | delete config; 74 | } 75 | 76 | delete raft; 77 | } 78 | 79 | int raftStateMachine::step(const Message& msg) { 80 | return raft->step(msg); 81 | } 82 | 83 | void raftStateMachine::readMessages(MessageVec *msgs) { 84 | raft->readMessages(msgs); 85 | } 86 | 87 | void idsBySize(int size, vector* ids) { 88 | int i = 0; 89 | for (i = 0; i < size; ++i) { 90 | ids->push_back(1 + i); 91 | } 92 | } 93 | 94 | // newNetworkWithConfig is like newNetwork but calls the given func to 95 | // modify the configuration of any state machines it creates. 96 | network* newNetworkWithConfig(ConfigFun fun, vector peers) { 97 | srand(time(NULL)); 98 | int size = peers.size(); 99 | vector peerAddrs; 100 | idsBySize(size, &peerAddrs); 101 | network *net = new network(); 102 | MemoryStorage *s; 103 | Config *c; 104 | raftStateMachine *r; 105 | raft *rf; 106 | 107 | int i, j; 108 | for (i = 0; i < size; ++i) { 109 | stateMachine *p = peers[i]; 110 | uint64_t id = peerAddrs[i]; 111 | 112 | if (!p) { 113 | s = new MemoryStorage(NULL); 114 | net->storage[id] = s; 115 | c = newTestConfig(id, peerAddrs, 10, 1, s); 116 | if (fun) { 117 | fun(c); 118 | } 119 | r = new raftStateMachine(c); 120 | net->peers[id] = r; 121 | continue; 122 | } 123 | 124 | switch (p->type()) { 125 | case raftType: 126 | rf = (raft *)p->data(); 127 | rf->id_ = id; 128 | for (j = 0; j < size; ++j) { 129 | rf->progressMap_[peerAddrs[j]] = new Progress(0, 256); 130 | } 131 | rf->reset(rf->term_); 132 | net->peers[id] = p; 133 | break; 134 | case blackHoleType: 135 | net->peers[id] = p; 136 | break; 137 | } 138 | } 139 | 140 | return net; 141 | } 142 | 143 | network* newNetwork(const vector& peers) { 144 | 
return newNetworkWithConfig(NULL, peers); 145 | } 146 | 147 | network::~network() { 148 | { 149 | map::iterator iter = peers.begin(); 150 | while (iter != peers.end()) { 151 | if (iter->second != nopStepper) { 152 | delete iter->second; 153 | } 154 | 155 | ++iter; 156 | } 157 | } 158 | } 159 | 160 | void network::send(vector *msgs) { 161 | while (!msgs->empty()) { 162 | const Message& msg = (*msgs)[0]; 163 | stateMachine *sm = peers[msg.to()]; 164 | sm->step(msg); 165 | vector out; 166 | MessageVec readMsgs; 167 | msgs->erase(msgs->begin(), msgs->begin() + 1); 168 | sm->readMessages(&readMsgs); 169 | filter(readMsgs, &out); 170 | msgs->insert(msgs->end(), out.begin(), out.end()); 171 | } 172 | } 173 | 174 | void network::drop(uint64_t from, uint64_t to, int perc) { 175 | dropm[connem(from, to)] = perc; 176 | } 177 | 178 | void network::cut(uint64_t one, uint64_t other) { 179 | drop(one, other, 10); 180 | drop(other, one, 10); 181 | } 182 | 183 | void network::isolate(uint64_t id) { 184 | size_t i; 185 | for (i = 0; i < peers.size(); ++i) { 186 | uint64_t nid = i + 1; 187 | if (nid != id) { 188 | drop(id, nid, 10); 189 | drop(nid, id, 10); 190 | } 191 | } 192 | } 193 | 194 | void network::ignore(MessageType t) { 195 | ignorem[t] = true; 196 | } 197 | 198 | void network::recover() { 199 | dropm.clear(); 200 | ignorem.clear(); 201 | } 202 | 203 | void network::filter(const vector& msgs, vector *out) { 204 | size_t i; 205 | for (i = 0; i < msgs.size(); ++i) { 206 | Message *msg = msgs[i]; 207 | if (ignorem[msg->type()]) { 208 | continue; 209 | } 210 | 211 | int perc; 212 | switch (msg->type()) { 213 | case MsgHup: 214 | break; 215 | default: 216 | perc = dropm[connem(msg->from(), msg->to())]; 217 | if (rand() % 10 < perc) { 218 | continue; 219 | } 220 | } 221 | 222 | out->push_back(*msg); 223 | } 224 | } 225 | 226 | raft* newTestRaft(uint64_t id, const vector& peers, int election, int hb, Storage *s) { 227 | return newRaft(newTestConfig(id, peers, election, hb, s)); 
228 | } 229 | 230 | Config* newTestConfig(uint64_t id, const vector& peers, int election, int hb, Storage *s) { 231 | Config *c = new Config(); 232 | c->id = id; 233 | c->peers = peers; 234 | c->electionTick = election; 235 | c->heartbeatTick = hb; 236 | c->storage = s; 237 | c->maxSizePerMsg = kNoLimit; 238 | c->maxInflightMsgs = 256; 239 | c->readOnlyOption = ReadOnlySafe; 240 | c->checkQuorum = false; 241 | return c; 242 | } 243 | 244 | -------------------------------------------------------------------------------- /test/raft_test_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #ifndef __LIBRAFT_RAFT_TEST_UTIL_H__ 6 | #define __LIBRAFT_RAFT_TEST_UTIL_H__ 7 | 8 | #include "libraft.h" 9 | #include "core/raft.h" 10 | #include "core/progress.h" 11 | #include "storage/memory_storage.h" 12 | 13 | using namespace libraft; 14 | 15 | enum stateMachineType { 16 | raftType = 0, 17 | blackHoleType = 1 18 | }; 19 | 20 | struct stateMachine { 21 | virtual ~stateMachine() {} 22 | 23 | virtual int step(const Message& ) = 0; 24 | virtual void readMessages(MessageVec *) = 0; 25 | 26 | virtual int type() = 0; 27 | virtual void* data() = 0; 28 | }; 29 | 30 | struct connem { 31 | uint64_t from, to; 32 | 33 | bool operator == (const connem& c) { 34 | return from == c.from && to == c.to; 35 | } 36 | 37 | void operator = (const connem& c) { 38 | from = c.from; 39 | to = c.to; 40 | } 41 | 42 | connem(uint64_t from, uint64_t to) 43 | : from(from), to(to) {} 44 | }; 45 | 46 | struct network { 47 | map peers; 48 | map storage; 49 | map dropm; 50 | map ignorem; 51 | 52 | ~network(); 53 | void send(vector* msgs); 54 | void drop(uint64_t from, uint64_t to, int perc); 55 | void cut(uint64_t one, uint64_t other); 56 | void isolate(uint64_t id); 57 | void ignore(MessageType t); 58 | void recover(); 59 | void filter(const vector& msg, vector *out); 60 | }; 61 | 62 | struct raftStateMachine : public 
stateMachine { 63 | raftStateMachine(Config *c); 64 | raftStateMachine(raft *); 65 | virtual ~raftStateMachine(); 66 | 67 | virtual int step(const Message& ); 68 | virtual void readMessages(MessageVec *); 69 | 70 | virtual int type() { return raftType; } 71 | virtual void* data() { return raft; } 72 | 73 | Config* config; 74 | raft *raft; 75 | }; 76 | 77 | struct blackHole : public stateMachine { 78 | blackHole() {} 79 | virtual ~blackHole() {} 80 | 81 | int step(const Message& ) { return OK; } 82 | void readMessages(MessageVec *) {} 83 | 84 | int type() { return blackHoleType; } 85 | void* data() { return NULL; } 86 | }; 87 | 88 | typedef void (*ConfigFun)(Config*); 89 | 90 | extern Config* newTestConfig(uint64_t id, const vector& peers, int election, int hb, Storage *s); 91 | extern raft* newTestRaft(uint64_t id, const vector& peers, int election, int hb, Storage *s); 92 | extern network* newNetworkWithConfig(ConfigFun fun, vector peers); 93 | extern network* newNetwork(const vector& peers); 94 | extern void nextEnts(raft *r, MemoryStorage *s, EntryVec *entries); 95 | extern string raftLogString(raftLog *log); 96 | extern void idsBySize(int size, vector* ids); 97 | 98 | static inline Entry 99 | initEntry(uint64_t index=0, uint64_t term=0,const string data = "") { 100 | Entry entry; 101 | entry.set_index(index); 102 | entry.set_term(term); 103 | entry.set_data(data); 104 | return entry; 105 | } 106 | 107 | static inline Message 108 | initMessage(uint64_t from=0, uint64_t to=0, const MessageType typ=MsgHup, EntryVec *entries = NULL, uint64_t index = 0) { 109 | Message msg; 110 | msg.set_from(from); 111 | msg.set_to(to); 112 | msg.set_index(index); 113 | msg.set_type(typ); 114 | if (entries != NULL) { 115 | uint32_t i; 116 | for (i = 0; i < entries->size(); ++i) { 117 | *(msg.add_entries()) = (*entries)[i]; 118 | } 119 | } 120 | return msg; 121 | } 122 | 123 | static inline Snapshot* 124 | newSnapshot(uint64_t index = 0, uint64_t term = 0) { 125 | Snapshot* s = new 
Snapshot(); 126 | SnapshotMetadata *meta = s->mutable_metadata(); 127 | 128 | if (index != 0) { 129 | meta->set_index(index); 130 | } 131 | if (term != 0) { 132 | meta->set_term(term); 133 | } 134 | return s; 135 | } 136 | 137 | static inline Progress 138 | initProgress(uint64_t next, int maxInfilght, ProgressState state, uint64_t match, uint64_t pendingSnapshot = 0) { 139 | Progress progress(next, maxInfilght); 140 | progress.state_ = state; 141 | progress.match_ = match; 142 | progress.pendingSnapshot_ = pendingSnapshot; 143 | 144 | return progress; 145 | } 146 | 147 | #define SIZEOF_ARRAY(array) sizeof(array) / sizeof(array[0]) 148 | 149 | #endif // __LIBRAFT_RAFT_TEST_UTIL_H__ 150 | -------------------------------------------------------------------------------- /test/record_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include 7 | #include "raft_test_util.h" 8 | #include "base/crc32c.h" 9 | #include "base/io_error.h" 10 | #include "base/io_buffer.h" 11 | #include "base/util.h" 12 | #include "proto/record.pb.h" 13 | #include "wal/decoder.h" 14 | #include "wal/wal.h" 15 | 16 | using namespace std; 17 | using namespace walpb; 18 | 19 | typedef unsigned char byte; 20 | typedef vector ByteVector; 21 | 22 | const static byte infoDataBytes[] = "\b\xef\xfd\x02"; 23 | const static byte infoRecordBytes[] = "\x0e\x00\x00\x00\x00\x00\x00\x00\b\x01\x10\x99\xb5\xe4\xd0\x03\x1a\x04"; 24 | 25 | const static string infoData = string((char*)infoDataBytes, sizeof(infoDataBytes)); 26 | const static string infoRecord = string((char*)infoRecordBytes, sizeof(infoRecordBytes) - 1) + infoData; 27 | 28 | static inline Record 29 | initRecord(int64_t type, uint32_t crc, const string& data) { 30 | Record record; 31 | record.set_type(type); 32 | record.set_crc(crc); 33 | record.set_data(data); 34 | 35 | return record; 36 | } 37 | 38 | TEST(recordTests, TestReadRecord) { 39 | string 
badInfoRecord = string(infoRecord.c_str(), infoRecord.length() - 1); 40 | badInfoRecord[badInfoRecord.length() - 1] = 'a'; 41 | 42 | struct tmp { 43 | string data; 44 | Record wr; 45 | int we; 46 | } tests[] = { 47 | { 48 | .data = infoRecord, 49 | .wr = initRecord(1, Value(infoData.c_str(), infoData.length() - 1), string(infoData.c_str(), infoData.length() - 1)), 50 | .we = 0, 51 | }, 52 | { 53 | .data = "", .wr = Record(), .we = kEOF, 54 | }, 55 | { 56 | .data = infoRecord.substr(0,8), .wr = Record(), .we = kErrUnexpectedEOF, 57 | }, 58 | { 59 | .data = infoRecord.substr(0,infoRecord.length() - infoData.length() - 8), .wr = Record(), .we = kErrUnexpectedEOF, 60 | }, 61 | { 62 | .data = infoRecord.substr(0,infoRecord.length() - infoData.length()), .wr = Record(), .we = kErrUnexpectedEOF, 63 | }, 64 | { 65 | .data = infoRecord.substr(0,infoRecord.length() - 8), .wr = Record(), .we = kErrUnexpectedEOF, 66 | }, 67 | { 68 | .data = badInfoRecord, .wr = initRecord(1,0,""), .we = kErrCRCMismatch, 69 | }, 70 | }; 71 | 72 | uint32_t i; 73 | for (i = 0; i < SIZEOF_ARRAY(tests); ++i) { 74 | tmp& tt = tests[i]; 75 | Record record; 76 | vector buf = {newMemoryBufferWithString(tt.data)}; 77 | Decoder* dec = newDecoder(buf); 78 | int err = dec->decode(&record); 79 | 80 | ASSERT_EQ(err, tt.we) << "i:" << i << tt.data.length(); 81 | 82 | if (tt.we != kErrCRCMismatch) 83 | ASSERT_TRUE(isDeepEqualRecord(record, tt.wr)) << "i:" << i; 84 | 85 | delete dec; 86 | } 87 | } 88 | 89 | TEST(recordTests, TestWriteRecord) { 90 | } -------------------------------------------------------------------------------- /test/unstable_log_test.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) lichuang 3 | */ 4 | 5 | #include 6 | #include "libraft.h" 7 | #include "raft_test_util.h" 8 | #include "base/util.h" 9 | #include "storage/unstable_log.h" 10 | 11 | using namespace libraft; 12 | 13 | TEST(unstableLogTests, TestUnstableMaybeFirstIndex) 
{ 14 | struct tmp { 15 | EntryVec entries; 16 | uint64_t offset; 17 | Snapshot *snapshot; 18 | bool wok; 19 | uint64_t windex; 20 | } tests[] = { 21 | // no snapshot 22 | { 23 | .entries = {initEntry(5,1)}, 24 | .offset = 5, .snapshot = NULL, 25 | .wok = false, .windex = 0, 26 | }, 27 | { 28 | .entries = {}, 29 | .offset = 0, .snapshot = NULL, 30 | .wok = false, .windex = 0, 31 | }, 32 | // has snapshot 33 | { 34 | .entries = {initEntry(5,1)}, 35 | .offset = 5, .snapshot = newSnapshot(4,1), 36 | .wok = true, .windex = 5, 37 | }, 38 | { 39 | .entries = {}, 40 | .offset = 5, .snapshot = newSnapshot(4,1), 41 | .wok = true, .windex = 5, 42 | }, 43 | }; 44 | 45 | size_t i; 46 | for (i = 0;i < SIZEOF_ARRAY(tests); ++i) { 47 | unstableLog unstable; 48 | unstable.entries_ = tests[i].entries; 49 | unstable.offset_ = tests[i].offset; 50 | unstable.snapshot_ = tests[i].snapshot; 51 | 52 | uint64_t index; 53 | bool ok = unstable.maybeFirstIndex(&index); 54 | EXPECT_EQ(tests[i].wok, ok); 55 | EXPECT_EQ(tests[i].windex, index); 56 | if (tests[i].snapshot != NULL) { 57 | delete tests[i].snapshot; 58 | } 59 | } 60 | } 61 | 62 | TEST(unstableLogTests, TestMaybeLastIndex) { 63 | struct tmp { 64 | EntryVec entries; 65 | uint64_t offset; 66 | Snapshot *snapshot; 67 | bool wok; 68 | uint64_t windex; 69 | } tests[] = { 70 | // last in entries 71 | { 72 | .entries = {initEntry(5,1),}, 73 | .offset = 5, .snapshot = NULL, 74 | .wok = true, .windex = 5, 75 | }, 76 | { 77 | .entries = {initEntry(5,1),}, 78 | .offset = 5, .snapshot = newSnapshot(4,1), 79 | .wok = true, .windex = 5, 80 | }, 81 | // last in snapshot 82 | { 83 | .entries = {}, 84 | .offset = 5, .snapshot = newSnapshot(4,1), 85 | .wok = true, .windex = 4, 86 | }, 87 | // empty unstable 88 | { 89 | .entries = {}, 90 | .offset = 0, .snapshot = NULL, 91 | .wok = false, .windex = 0, 92 | }, 93 | }; 94 | 95 | size_t i; 96 | for (i = 0;i < SIZEOF_ARRAY(tests); ++i) { 97 | unstableLog unstable; 98 | unstable.entries_ = tests[i].entries; 
99 | unstable.offset_ = tests[i].offset; 100 | unstable.snapshot_ = tests[i].snapshot; 101 | 102 | uint64_t index; 103 | bool ok = unstable.maybeLastIndex(&index); 104 | EXPECT_EQ(tests[i].wok, ok); 105 | EXPECT_EQ(tests[i].windex, index); 106 | 107 | if (tests[i].snapshot != NULL) { 108 | delete tests[i].snapshot; 109 | } 110 | } 111 | } 112 | 113 | TEST(unstableLogTests, TestUnstableMaybeTerm) { 114 | struct tmp { 115 | EntryVec entries; 116 | uint64_t offset; 117 | Snapshot *snapshot; 118 | uint64_t index; 119 | bool wok; 120 | uint64_t wterm; 121 | } tests[] = { 122 | // term from entries 123 | { 124 | .entries = {initEntry(5,1)}, 125 | .offset = 5, .snapshot = NULL, 126 | .index = 5, .wok = true, .wterm = 1, 127 | }, 128 | { 129 | .entries = {initEntry(5,1)}, 130 | .offset = 5, .snapshot = NULL, 131 | .index = 6, .wok = false, .wterm = 0, 132 | }, 133 | { 134 | .entries = {initEntry(5,1)}, 135 | .offset = 5, .snapshot = NULL, 136 | .index = 4, .wok = false, .wterm = 0, 137 | }, 138 | { 139 | .entries = {initEntry(5,1)}, 140 | .offset = 5, .snapshot = newSnapshot(4,1), 141 | .index = 5, .wok = true, .wterm = 1, 142 | }, 143 | { 144 | .entries = {initEntry(5,1)}, 145 | .offset = 5, .snapshot = newSnapshot(4,1), 146 | .index = 6, .wok = false, .wterm = 0, 147 | }, 148 | // term from snapshot 149 | { 150 | .entries = {initEntry(5,1)}, 151 | .offset = 5, .snapshot = newSnapshot(4,1), 152 | .index = 4, .wok = true, .wterm = 1, 153 | }, 154 | { 155 | .entries = {initEntry(5,1)}, 156 | .offset = 5, .snapshot = newSnapshot(4,1), 157 | .index = 3, .wok = false, .wterm = 0, 158 | }, 159 | { 160 | .entries = {}, 161 | .offset = 5, .snapshot = newSnapshot(4,1), 162 | .index = 5, .wok = false, .wterm = 0, 163 | }, 164 | { 165 | .entries = {}, 166 | .offset = 5, .snapshot = newSnapshot(4,1), 167 | .index = 4, .wok = true, .wterm = 1, 168 | }, 169 | { 170 | .entries = {}, 171 | .offset = 0, .snapshot = NULL, 172 | .index = 5, .wok = false, .wterm = 0, 173 | }, 174 | }; 175 | 
176 | size_t i; 177 | for (i = 0;i < SIZEOF_ARRAY(tests); ++i) { 178 | unstableLog unstable; 179 | unstable.entries_ = tests[i].entries; 180 | unstable.offset_ = tests[i].offset; 181 | unstable.snapshot_ = tests[i].snapshot; 182 | 183 | uint64_t term; 184 | bool ok = unstable.maybeTerm(tests[i].index, &term); 185 | EXPECT_EQ(tests[i].wok, ok) << "i: " << i << ", index: " << tests[i].index; 186 | EXPECT_EQ(tests[i].wterm, term); 187 | 188 | if (tests[i].snapshot != NULL) { 189 | delete tests[i].snapshot; 190 | } 191 | } 192 | } 193 | 194 | TEST(unstableLogTests, TestUnstableRestore) { 195 | unstableLog unstable; 196 | unstable.entries_ = {initEntry(5,1)};; 197 | unstable.offset_ = 5; 198 | unstable.snapshot_ = newSnapshot(4,1); 199 | 200 | Snapshot s; 201 | { 202 | SnapshotMetadata *tmp_meta = s.mutable_metadata(); 203 | tmp_meta->set_index(6); 204 | tmp_meta->set_term(2); 205 | unstable.restore(s); 206 | } 207 | 208 | EXPECT_EQ(unstable.offset_, s.metadata().index() + 1); 209 | EXPECT_EQ((int)unstable.entries_.size(), 0); 210 | EXPECT_TRUE(isDeepEqualSnapshot(unstable.snapshot_, &s)); 211 | 212 | delete unstable.snapshot_; 213 | } 214 | 215 | TEST(unstableLogTests, TestUnstableStableTo) { 216 | struct tmp { 217 | EntryVec entries; 218 | uint64_t offset; 219 | Snapshot *snapshot; 220 | uint64_t index, term; 221 | 222 | uint64_t woffset; 223 | int wlen; 224 | 225 | } tests[] = { 226 | { 227 | .entries = {}, 228 | .offset = 0, .snapshot = NULL, 229 | .index = 5, .term = 1, 230 | .woffset = 0, .wlen = 0, 231 | }, 232 | { 233 | .entries = {initEntry(5,1)}, 234 | .offset = 5, .snapshot = NULL, 235 | .index = 5, .term = 1, // stable to the first entry 236 | .woffset = 6, .wlen = 0, 237 | }, 238 | { 239 | .entries = {initEntry(5,1), initEntry(6,1)}, 240 | .offset = 5, .snapshot = NULL, 241 | .index = 5, .term = 1, // stable to the first entry 242 | .woffset = 6, .wlen = 1, 243 | }, 244 | { 245 | .entries = {initEntry(6,2)}, 246 | .offset = 6, .snapshot = NULL, 247 | .index 
= 6, .term = 1, // stable to the first entry and term mismatch 248 | .woffset = 6, .wlen = 1, 249 | }, 250 | { 251 | .entries = {initEntry(5,1)}, 252 | .offset = 5, .snapshot = NULL, 253 | .index = 4, .term = 1, // stable to old entry 254 | .woffset = 5, .wlen = 1, 255 | }, 256 | { 257 | .entries = {initEntry(5,1)}, 258 | .offset = 5, .snapshot = NULL, 259 | .index = 4, .term = 2, // stable to old entry 260 | .woffset = 5, .wlen = 1, 261 | }, 262 | // with snapshot 263 | { 264 | .entries = {initEntry(5,1)}, 265 | .offset = 5, .snapshot = newSnapshot(4,1), 266 | .index = 5, .term = 1, // stable to the first entry 267 | .woffset = 6, .wlen = 0, 268 | }, 269 | { 270 | .entries = {initEntry(5,1), initEntry(6,1)}, 271 | .offset = 5, .snapshot = newSnapshot(4,1), 272 | .index = 5, .term = 1, // stable to the first entry 273 | .woffset = 6, .wlen = 1, 274 | }, 275 | { 276 | .entries = {initEntry(6,2)}, 277 | .offset = 6, .snapshot = newSnapshot(5,1), 278 | .index = 6, .term = 1, // stable to the first entry and term mismatch 279 | .woffset = 6, .wlen = 1, 280 | }, 281 | { 282 | .entries = {initEntry(5,1)}, 283 | .offset = 5, .snapshot = newSnapshot(5,1), 284 | .index = 4, .term = 1, // stable to snapshot 285 | .woffset = 5, .wlen = 1, 286 | }, 287 | { 288 | .entries = {initEntry(5,2)}, 289 | .offset = 5, .snapshot = newSnapshot(4,2), 290 | .index = 4, .term = 1, // stable to snapshot 291 | .woffset = 5, .wlen = 1, 292 | }, 293 | }; 294 | 295 | size_t i; 296 | for (i = 0;i < SIZEOF_ARRAY(tests); ++i) { 297 | unstableLog unstable; 298 | unstable.entries_ = tests[i].entries; 299 | unstable.offset_ = tests[i].offset; 300 | unstable.snapshot_ = tests[i].snapshot; 301 | 302 | unstable.stableTo(tests[i].index, tests[i].term); 303 | EXPECT_EQ(unstable.offset_, tests[i].woffset) << "i: " << i << ", woffset: " << tests[i].woffset; 304 | EXPECT_EQ((int)unstable.entries_.size(), tests[i].wlen); 305 | 306 | if (tests[i].snapshot != NULL) { 307 | delete tests[i].snapshot; 308 | } 309 | 
} 310 | } 311 | 312 | TEST(unstableLogTests, TestUnstableTruncateAndAppend) { 313 | struct tmp { 314 | EntryVec entries; 315 | uint64_t offset; 316 | Snapshot *snapshot; 317 | EntryVec toappend; 318 | uint64_t woffset; 319 | EntryVec wentries; 320 | } tests[] = { 321 | // append to the end 322 | { 323 | .entries = {initEntry(5,1),}, 324 | .offset = 5, .snapshot = NULL, 325 | .toappend = {initEntry(6,1),initEntry(7,1),}, 326 | .woffset = 5, 327 | .wentries = {initEntry(5,1),initEntry(6,1),initEntry(7,1)}, 328 | }, 329 | // replace the unstable entries 330 | { 331 | .entries = {initEntry(5,1),}, 332 | .offset = 5, .snapshot = NULL, 333 | .toappend = {initEntry(5,2),initEntry(6,2),}, 334 | .woffset = 5, 335 | .wentries = {initEntry(5,2),initEntry(6,2),}, 336 | }, 337 | { 338 | .entries = {initEntry(5,1),}, 339 | .offset = 5, .snapshot = NULL, 340 | .toappend = {initEntry(4,2),initEntry(5,2),initEntry(6,2),}, 341 | .woffset = 4, 342 | .wentries = {initEntry(4,2),initEntry(5,2),initEntry(6,2),}, 343 | }, 344 | // truncate the existing entries and append 345 | { 346 | .entries = {initEntry(5,1),initEntry(6,1),initEntry(7,1),}, 347 | .offset = 5, .snapshot = NULL, 348 | .toappend = {initEntry(6,2),}, 349 | .woffset = 5, 350 | .wentries = {initEntry(5,1),initEntry(6,2),}, 351 | }, 352 | { 353 | .entries = {initEntry(5,1),initEntry(6,1),initEntry(7,1),}, 354 | .offset = 5, .snapshot = NULL, 355 | .toappend = {initEntry(7,2),initEntry(8,2),}, 356 | .woffset = 5, 357 | .wentries = {initEntry(5,1),initEntry(6,1),initEntry(7,2),initEntry(8,2),}, 358 | }, 359 | }; 360 | 361 | size_t i; 362 | for (i = 0;i < SIZEOF_ARRAY(tests); ++i) { 363 | unstableLog unstable; 364 | unstable.entries_ = tests[i].entries; 365 | unstable.offset_ = tests[i].offset; 366 | unstable.snapshot_ = tests[i].snapshot; 367 | 368 | unstable.truncateAndAppend(tests[i].toappend); 369 | EXPECT_EQ(unstable.offset_, tests[i].woffset) << "i: " << i << ", woffset: " << tests[i].woffset; 370 | 
EXPECT_TRUE(isDeepEqualEntries(unstable.entries_, tests[i].wentries)) << "i: " << i; 371 | 372 | if (tests[i].snapshot != NULL) { 373 | delete tests[i].snapshot; 374 | } 375 | } 376 | } 377 | --------------------------------------------------------------------------------