├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── log.rs ├── build.rs ├── examples ├── hashmap.rs └── register.rs ├── experiments ├── playbooks │ └── install.yml └── tmux │ └── hashmap-local-3 │ ├── cas │ ├── dashboard │ ├── get │ ├── put │ └── server ├── raft.png └── src ├── backoff.rs ├── client.rs ├── connection.rs ├── consensus.rs ├── lib.rs ├── messages.capnp ├── messages.rs ├── persistent_log ├── fs.rs ├── mem.rs └── mod.rs ├── server.rs ├── state.rs └── state_machine ├── channel.rs ├── mod.rs └── null.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use containers 2 | sudo: false 3 | 4 | # Yup we use rust. 5 | language: rust 6 | 7 | # Test only on nightly for now. 8 | rust: 9 | - nightly 10 | 11 | env: 12 | global: 13 | - secure: eotueXoyGdW2TIser1HAj5I1l5KAmRww5tW5uMNWA7ytVrwibQ4qVCUhMZ3ZUBXWJsMvsebdXnmfqKwyZ84FqjPmsYv9WTfzatYyMcMUlzkPITfsUoJ03fcUcKc4gVyV5SifIuDWCWBMX+LG6eU3I/CqjgOFw6NrBoHhLTwh4yc= 14 | - CC=gcc-4.8 15 | - CXX=g++-4.8 16 | - RUST_LOG=raft=debug 17 | - RUST_BACKTRACE=1 18 | - TRAVIS_CARGO_NIGHTLY_FEATURE="" 19 | 20 | # `sudo`-less apt install. 21 | addons: 22 | apt: 23 | sources: 24 | - ubuntu-toolchain-r-test 25 | packages: 26 | # Needed for building Cap'n Proto. 27 | - gcc-4.8 28 | - g++-4.8 29 | # Needed for `travis-cargo coveralls --no-sudo` 30 | - libcurl4-openssl-dev 31 | - libelf-dev 32 | - libdw-dev 33 | 34 | # We need to install Cap'n Proto. 35 | install: 36 | - git clone https://github.com/kentonv/capnproto.git 37 | - cd capnproto/c++ 38 | # Use master to avoid autotools breakage (13 May 2016) 39 | #- git checkout release-0.5.3 40 | - ./setup-autotools.sh 41 | - autoreconf -i 42 | - ./configure --disable-shared 43 | - make -j5 44 | - export PATH="$PATH:$(pwd)" 45 | - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(pwd)" 46 | - cd ../.. 47 | 48 | # Load `travis-cargo` 49 | before_script: 50 | - pip install 'travis-cargo' --user 51 | - export PATH=$HOME/.local/bin:$PATH 52 | 53 | script: 54 | - travis-cargo build 55 | - travis-cargo test 56 | - travis-cargo doc 57 | 58 | # Generate Docs and coverage 59 | after_success: 60 | - travis-cargo doc-upload 61 | - travis-cargo coveralls --no-sudo 62 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "raft" 3 | version = "0.0.1" 4 | authors = [ 5 | "Andrew Hobden ", 6 | "Dan Burkert ", 7 | "James McGlashan ", 8 | ] 9 | description = """ 10 | Today our systems operate in extreme conditions, functioning across containers, 11 | virtual machines, infrastructure, networks, embedded systems, in our pockets, 12 | and even inside of us. Many of these systems depend on one another for 13 | operation, others are able to operate in failing connectivity without disaster. 14 | In many cases it is preferable to have the latter, especially if the operation 15 | the continued operation of the system is at stake. Distributed consensus 16 | represents one small part of a larger system, and offer the ability to maintain 17 | a replicated persistent log containing actions that are applied globally into 18 | a state machine. 
This allows *n* clients to communicate to a cluster of *m* 19 | servers in a stable and predictable manner, even in failing network conditions. 20 | 21 | Using Ongaro and Osterhouts's Raft algorithm we are developing a fast, low 22 | level, low requirements implementation of the system in an unopinionated, 23 | minimal way. The Raft library interfaces with custom or preexisting Log and 24 | State Machine implementations, providing a Client, Server, and Consensus Module 25 | within its core. We have chosen an asynchronous single threaded event loop model 26 | in the Rust language, allowing our implementation to have strong safety and 27 | performance characteristics with low demands. Communication, a primary 28 | performance concern, is kept as lightweight and fast as possible by using 29 | Renshaw's Cap'n Proto implementation. We are currently exploring opportunities 30 | in trust and security as well as testing our implementation for further failure 31 | conditions.""" 32 | readme = "README.md" 33 | keywords = [ 34 | "Raft", "Distributed Computing", "Consensus", "State Machine", 35 | "Persistent Log", "Networking", 36 | ] 37 | license = "MIT" 38 | 39 | # Builds Cap'n Proto messages 40 | build = "build.rs" 41 | 42 | # Dependencies 43 | [build-dependencies] 44 | capnpc = "0.5" 45 | 46 | [dependencies] 47 | bufstream = "0.1" 48 | byteorder = "*" 49 | capnp = "0.6" 50 | capnp-nonblock = "0.4" 51 | log = "0.3" 52 | mio = "0.5" 53 | rand = "0.3" 54 | scoped_log = "0.1" 55 | uuid = "0.1" 56 | wrapped_enum = "0.1" 57 | 58 | [dev-dependencies] 59 | env_logger = "0.4" 60 | # Used in Examples 61 | docopt = "0.7" 62 | serde = "0.9" 63 | serde_json = "0.9" 64 | serde_derive = "0.9" 65 | rustc-serialize = "0.3" 66 | bincode = "0.9.2" 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 The Raft project developers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Raft-rs # 2 | 3 | > Note: This project is of **alpha** quality. **APIs are still in some flux, but they are ready for you to play with them.** A stable version will be released when we feel it is ready. 
4 | 5 | [![Build Status](https://img.shields.io/travis/Hoverbear/raft-rs/master.svg)](https://travis-ci.org/Hoverbear/raft-rs) 6 | [![Coverage Status](https://img.shields.io/coveralls/Hoverbear/raft-rs/master.svg)](https://coveralls.io/github/Hoverbear/raft-rs) 7 | 8 | **[Development Updates](http://www.hoverbear.org/tag/raft/)** 9 | 10 | ## Problem and Importance ## 11 | 12 | When building a distributed system one principal goal is often to build in *fault-tolerance*. That is, if one particular node in a network goes down, or if there is a network partition, the entire cluster does not fall over. The cluster of nodes taking part in a distributed consensus protocol must come to agreement regarding values, and once that decision is reached, that choice is final. 13 | 14 | Distributed Consensus Algorithms often take the form of a replicated state machine and log. Each state machine accepts inputs from its log, and represents the value(s) to be replicated, for example, a hash table. They allow a collection of machines to work as a coherent group that can survive the failures of some of its members. 15 | 16 | Two well-known Distributed Consensus Algorithms are Paxos and Raft. Paxos is used in systems like [Chubby](http://research.google.com/archive/chubby.html) by Google, and Raft is used in things like [`etcd`](https://github.com/coreos/etcd/tree/master/raft). Raft is generally seen as more understandable and simpler to implement than Paxos, and was chosen for this project for this reason. 17 | 18 | 19 | ## Documentation ## 20 | 21 | * [Raft Crate Documentation](https://hoverbear.github.io/raft-rs/raft/) 22 | * [The Raft site](https://raftconsensus.github.io/) 23 | * [The Secret Lives of Data - Raft](http://thesecretlivesofdata.com/raft/) 24 | * [Raft Paper](http://ramcloud.stanford.edu/raft.pdf) 25 | * [Raft Dissertation](https://github.com/ongardie/dissertation#readme) 26 | * [Raft Refloated](https://www.cl.cam.ac.uk/~ms705/pub/papers/2015-osr-raft.pdf) 27 | 28 | ## Compiling ## 29 | 30 | > For Linux, BSD, or Mac. Windows is not supported at this time. We are willing and interested in including support; however, none of our contributors work on Windows. Your PRs are welcome! 31 | 32 | You will need the [Rust](http://rust-lang.org/) compiler: 33 | 34 | ```bash 35 | curl -L https://static.rust-lang.org/rustup.sh > rustup 36 | chmod +x rustup 37 | ./rustup --channel=nightly 38 | ``` 39 | 40 | > We require the `nightly` channel for now. 41 | 42 | This should install `cargo` and `rustc`. Next, you'll need `capnp` to build the 43 | `messages.capnp` file. It is suggested to use the [git method](https://capnproto.org/install.html#installation-unix): 44 | 45 | ```bash 46 | git clone https://github.com/sandstorm-io/capnproto.git 47 | cd capnproto/c++ 48 | ./setup-autotools.sh 49 | autoreconf -i 50 | ./configure 51 | make -j6 check 52 | sudo make install 53 | ``` 54 | 55 | Finally, clone the repository and build it: 56 | 57 | ```bash 58 | git clone git@github.com:Hoverbear/raft-rs.git && \ 59 | cd raft-rs && \ 60 | cargo build 61 | ``` 62 | 63 | > Note this is a library, so building won't necessarily produce anything useful for you unless you're developing. 64 | 65 | ## Examples ## 66 | 67 | You can run a single-node `register` example like this: 68 | 69 | ```bash 70 | RUST_LOG=raft=debug cargo run --example register server 1 1 127.0.0.1:8080 71 | ``` 72 | 73 | There are currently examples showing: 74 | 75 | * **Register:** A single shared, replicated buffer for storing some data. Uses `bincode`.
76 | * **Hashmap:** A replicated hash table that stores `json::Value` with `String`s as keys. Uses `serde`. 77 | 78 | For a multi-node example (`hashmap` shown for variety), make sure to include all the peers on all instances: 79 | ```bash 80 | # Node 1 81 | RUST_LOG=raft=debug cargo run --example hashmap server 1 1 127.0.0.1:8080 2 127.0.0.1:8081 82 | # Node 2 83 | RUST_LOG=raft=debug cargo run --example hashmap server 2 1 127.0.0.1:8080 2 127.0.0.1:8081 84 | ``` 85 | 86 | We'd love it if you contributed your own or expanded on ours! 87 | 88 | ## Testing ## 89 | 90 | You can run the `raft` crate's full bank of tests with all debug output like so: 91 | 92 | ```bash 93 | RUST_BACKTRACE=1 RUST_LOG=raft=debug cargo test -- --nocapture 94 | ``` 95 | 96 | For something more terse use `cargo test`. 97 | 98 | ## Contributing ## 99 | 100 | **First timer with Git?** Check [this](https://github.com/hoverbear/rust-rosetta#contributing-1) out for some help!! 101 | 102 | We use [Homu](http://homu.io/q/Hoverbear/raft) for merging requests. **This means we cannot merge your code unless it passes tests!** 103 | -------------------------------------------------------------------------------- /benches/log.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | #![cfg(test)] 3 | 4 | extern crate raft; 5 | extern crate rand; 6 | extern crate test; 7 | 8 | use std::fs::remove_file; 9 | use std::path::Path; 10 | 11 | use rand::Rng; 12 | 13 | use raft::persistent_log::FsLog; 14 | use raft::{Log, LogIndex, Term}; 15 | 16 | #[bench] 17 | fn bench_log_control(b: &mut test::Bencher) { 18 | let mut rng = rand::OsRng::new().unwrap(); 19 | b.iter(|| { 20 | let i: u64 = rng.gen(); 21 | let name = format!("/tmp/raft-rs-bench-log-control-{:016x}", i); 22 | let filename = Path::new(&name); 23 | let log = FsLog::new(&filename).unwrap(); 24 | let x = log.latest_log_index(); 25 | remove_file(&filename).expect("Could not remove file"); 26 | x 27 | }); 28 | } 29 | 30 | fn do_bench_append(b: &mut test::Bencher, name: &str, count: usize) { 31 | let mut rng = rand::OsRng::new().unwrap(); 32 | let values: Vec = (0..255).collect(); 33 | let mut entries = vec![]; 34 | for x in 0..count { 35 | entries.push((Term::from(0x1234abcd8765fedc), &values[(x % 100)..(x % 100 + 100)])); 36 | } 37 | b.iter(|| { 38 | let i: u64 = rng.gen(); 39 | let name = format!("/tmp/raft-rs-bench-log-{}-{:016x}", name, i); 40 | let filename = Path::new(&name); 41 | let mut log = FsLog::new(&filename).unwrap(); 42 | log.append_entries( 43 | LogIndex::from(1), 44 | &entries[..], 45 | ).expect("appending entries"); 46 | let x = log.latest_log_index(); 47 | remove_file(&filename).expect("Could not remove file"); 48 | x 49 | }); 50 | } 51 | 52 | fn do_bench_append_then_rewrite(b: &mut test::Bencher, name: &str, count: usize, rewrite: usize, from: LogIndex) { 53 | let mut rng = rand::OsRng::new().unwrap(); 54 | let values: Vec = (0..100).collect(); 55 | let mut initial_entries = vec![]; 56 | let mut rewrite_entries = vec![]; 57 | for x in 0..count { 58 | initial_entries.push((Term::from(0x12), &values[(x % 100)..(x % 100 + 1)])); 59 | } 60 | for x in 0..rewrite { 61 | rewrite_entries.push((Term::from(0x30af), &values[((rewrite - x) % 100)..((rewrite - x) % 100 + 1)])); 62 | } 63 | b.iter(|| { 64 | let i: u64 = rng.gen(); 65 | let name = format!("/tmp/raft-rs-bench-log-{}-{:016x}", name, i); 66 | let filename = Path::new(&name); 67 | let mut log = FsLog::new(&filename).unwrap(); 68 | 
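// Append the initial entries first, then append again starting at `from` so the second
// call overwrites a suffix of the log; that overwrite is what this benchmark measures.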
log.append_entries(LogIndex::from(1), &initial_entries[..]).expect("append entries"); 69 | log.append_entries(from, &rewrite_entries[..]).expect("rewrite entries"); 70 | 71 | let x = log.latest_log_index(); 72 | remove_file(&filename).expect("Could not remove file"); 73 | x 74 | }); 75 | 76 | } 77 | 78 | #[bench] 79 | fn bench_log_append_0(b: &mut test::Bencher) { 80 | do_bench_append(b, "append0", 0); 81 | } 82 | 83 | #[bench] 84 | fn bench_log_append_1(b: &mut test::Bencher) { 85 | do_bench_append(b, "append1", 1); 86 | } 87 | 88 | 89 | #[bench] 90 | fn bench_log_append_10(b: &mut test::Bencher) { 91 | do_bench_append(b, "append10", 10); 92 | } 93 | 94 | #[bench] 95 | fn bench_log_append_100(b: &mut test::Bencher) { 96 | do_bench_append(b, "append100", 100); 97 | } 98 | #[bench] 99 | fn bench_log_append_1000(b: &mut test::Bencher) { 100 | do_bench_append(b, "append1000", 1000); 101 | } 102 | 103 | #[bench] 104 | fn bench_log_rewrite_100_1(b: &mut test::Bencher) { 105 | do_bench_append_then_rewrite(b, "rewrite100.1", 100, 1, LogIndex::from(50)); 106 | } 107 | 108 | #[bench] 109 | fn bench_log_rewrite_100_50(b: &mut test::Bencher) { 110 | do_bench_append_then_rewrite(b, "rewrite100.50", 100, 50, LogIndex::from(50)); 111 | } 112 | 113 | #[bench] 114 | fn bench_log_rewrite_100_100(b: &mut test::Bencher) { 115 | do_bench_append_then_rewrite(b, "rewrite100.100", 100, 100, LogIndex::from(50)); 116 | } 117 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | extern crate capnpc; 4 | 5 | fn main() { 6 | ::capnpc::compile(Path::new("src"), &[Path::new("src/messages.capnp")]).unwrap(); 7 | } 8 | -------------------------------------------------------------------------------- /examples/hashmap.rs: -------------------------------------------------------------------------------- 1 | //! This example demonstrates using Raft to implement a replicated hashmap over `n` servers and 2 | //! interact with them over `m` clients. 3 | //! 4 | //! This example uses Serde serialization. 5 | //! 6 | //! Comments below will aim to be tailored towards Raft and it's usage. If you have any questions, 7 | //! Please, just open an issue. 8 | //! 9 | //! TODO: For the sake of simplicity of this example, we don't implement a `Log` and just use a 10 | //! simple testing one. We should improve this in the future. 11 | 12 | extern crate raft; // <--- Kind of a big deal for this! 13 | extern crate env_logger; 14 | #[macro_use] extern crate log; 15 | #[macro_use] extern crate scoped_log; 16 | extern crate docopt; 17 | extern crate serde; 18 | extern crate serde_json; 19 | #[macro_use] extern crate serde_derive; 20 | extern crate rustc_serialize; 21 | 22 | use std::net::{SocketAddr, ToSocketAddrs}; 23 | use std::collections::HashMap; 24 | 25 | use serde_json::Value; 26 | use docopt::Docopt; 27 | 28 | // Raft's major components. See comments in code on usage and things. 29 | use raft::{ 30 | Server, 31 | Client, 32 | state_machine, 33 | persistent_log, 34 | ServerId, 35 | }; 36 | // A payload datatype. We're just using a simple enum. You can use whatever. 37 | use Message::*; 38 | 39 | // Using docopt we define the overall usage of the application. 40 | static USAGE: &'static str = " 41 | A replicated mutable hashmap. Operations on the register have serializable 42 | consistency, but no durability (once all register servers are terminated the 43 | map is lost). 
44 | 45 | Each register server holds a replica of the map, and coordinates with its 46 | peers to update the maps values according to client commands. The register 47 | is available for reading and writing only if a majority of register servers are 48 | available. 49 | 50 | 51 | Commands: 52 | 53 | get Returns the current value of the key. 54 | 55 | put Sets the current value of the key, and returns the previous 56 | value. 57 | 58 | cas (compare and set) Conditionally sets the value of the key if the 59 | current value matches an expected value, returning true if the 60 | key was set. 61 | 62 | server Starts a key server. Servers must be provided a unique ID and 63 | address (ip:port) at startup, along with the ID and address of all 64 | peer servers. 65 | 66 | Usage: 67 | hashmap get ()... 68 | hashmap put ()... 69 | hashmap cas ()... 70 | hashmap server [( )]... 71 | hashmap (-h | --help) 72 | 73 | Options: 74 | -h --help Show a help message. 75 | "; 76 | 77 | #[derive(Debug, RustcDecodable)] 78 | struct Args { 79 | cmd_server: bool, 80 | cmd_get: bool, 81 | cmd_put: bool, 82 | cmd_cas: bool, 83 | 84 | // When creating a server you will necessarily need some sort of unique ID for it as well 85 | // as a list of peers. In this example we just accept them straight from args. You might 86 | // find it best to use a `toml` or `yaml` or `json` file. 87 | arg_id: Option, 88 | arg_node_id: Vec, 89 | arg_node_address: Vec, 90 | 91 | // In this example keys and values are associated. In your application you can model your data 92 | // however you please. 93 | arg_key: String, 94 | arg_new_value: String, 95 | arg_expected_value: String, 96 | } 97 | 98 | /// This is the defined message type for this example. For the sake of simplicity we don't go very 99 | /// far with this. In a "real" application you may want to more distinctly distinguish between 100 | /// data meant for `.query()` and data meant for `.propose()`. 101 | #[derive(Serialize, Deserialize)] 102 | pub enum Message { 103 | Get(String), 104 | Put(String, Value), 105 | Cas(String, Value, Value), 106 | } 107 | 108 | /// Just a plain old boring "parse args and dispatch" call. 109 | fn main() { 110 | let _ = env_logger::init(); 111 | let args: Args = Docopt::new(USAGE) 112 | .and_then(|d| d.decode()) 113 | .unwrap_or_else(|e| e.exit()); 114 | if args.cmd_server { 115 | server(&args); 116 | } else if args.cmd_get { 117 | get(&args); 118 | } else if args.cmd_put { 119 | put(&args); 120 | } else if args.cmd_cas { 121 | cas(&args); 122 | } 123 | } 124 | 125 | /// A simple convenience method since this is an example and it should exit if given invalid params. 126 | fn parse_addr(addr: &str) -> SocketAddr { 127 | addr.to_socket_addrs() 128 | .expect(&format!("unable to parse socket address: {}", addr)) 129 | .next() 130 | .unwrap() 131 | } 132 | 133 | /// Creates a Raft server using the specified ID from the list of nodes. 134 | fn server(args: &Args) { 135 | // Creating a raft server requires several things: 136 | 137 | // A persistent log implementation, which manages the persistent, replicated log... 138 | let log = persistent_log::MemLog::new(); 139 | 140 | // A state machine which replicates state. This state should be the same on all nodes. 141 | let state_machine = HashmapStateMachine::new(); 142 | 143 | // As well as a unique server id. 144 | let id = ServerId::from(args.arg_id.unwrap()); 145 | 146 | // ... And a list of peers. 
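// Peers are supplied on the command line as matching lists of IDs and addresses,
// so zip the two argument lists together to build the peer map.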
147 | let mut peers = args.arg_node_id 148 | .iter() 149 | .zip(args.arg_node_address.iter()) 150 | .map(|(&id, addr)| (ServerId::from(id), parse_addr(addr))) 151 | .collect::<HashMap<_, _>>(); 152 | 153 | // The Raft Server will return an error if its ID is inside of its peer set. Don't do that. 154 | // Instead, take it out and use it! 155 | let addr = peers.remove(&id).unwrap(); 156 | 157 | // Using all of the above components. 158 | // You probably shouldn't `.unwrap()` in production code unless you're totally sure it works 159 | // 100% of the time, all the time. 160 | Server::new(id, addr, log, state_machine) 161 | .with_election_min_millis(1500) 162 | .with_election_max_millis(3000) 163 | .with_heartbeat_millis(1000) 164 | .with_peers(peers) 165 | .run() 166 | .unwrap(); 167 | } 168 | 169 | /// Gets a value for a given key from the provided Raft cluster. 170 | fn get(args: &Args) { 171 | // Clients necessarily need to know the valid set of nodes which they can talk to. 172 | // This is both so they can try to talk to all the nodes if some are failing, and so that it 173 | // can verify that it's not being led astray somehow in redirections on leadership changes. 174 | let cluster = args.arg_node_address.iter() 175 | .map(|v| parse_addr(v)) 176 | .collect(); 177 | 178 | // Clients can be stored and reused, or used once and discarded. 179 | // There is very small overhead in connecting a new client to a cluster as it must discover and 180 | // identify itself to the leader. 181 | let mut client = Client::new(cluster); 182 | 183 | // In this example `serde::json` is used to serialize and deserialize messages. 184 | // Since Raft accepts `[u8]`, the way you structure your data, the serialization method you 185 | // choose, and how you interpret that data are entirely up to you. 186 | let payload = serde_json::to_string(&Message::Get(args.arg_key.clone())).unwrap(); 187 | 188 | // A query executes **immutably** on the leader of the cluster and does not pass through the 189 | // persistent log. This is intended for querying the current state of the state machine. 190 | let response = client.query(payload.as_bytes()).unwrap(); 191 | 192 | // A response will block until its query is complete. This is intended and expected behavior 193 | // based on the paper's specifications. 194 | println!("{}", String::from_utf8(response).unwrap()) 195 | } 196 | 197 | /// Sets a value for a given key in the provided Raft cluster. 198 | fn put(args: &Args) { 199 | // Same as above. 200 | let cluster = args.arg_node_address.iter() 201 | .map(|v| parse_addr(v)) 202 | .collect(); 203 | 204 | let mut client = Client::new(cluster); 205 | 206 | let new_value = serde_json::to_value(&args.arg_new_value).unwrap(); 207 | let payload = serde_json::to_string(&Message::Put(args.arg_key.clone(), new_value)).unwrap(); 208 | 209 | // A proposal will go through the persistent log and mutably modify the state machine in some 210 | // way. This is **much** slower than `.query()`. 211 | let response = client.propose(payload.as_bytes()).unwrap(); 212 | 213 | // A response will block until its proposal is complete. This is intended and expected behavior 214 | // based on the paper's specifications. 215 | println!("{}", String::from_utf8(response).unwrap()) 216 | } 217 | 218 | /// Compares and sets a value for a given key in the provided Raft cluster if the value is what is 219 | /// expected. 220 | fn cas(args: &Args) { 221 | // Same as above.
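// A compare-and-set must be able to mutate the map, so it is submitted via `propose()`
// below rather than `query()`.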
222 | let cluster = args.arg_node_address.iter() 223 | .map(|v| parse_addr(v)) 224 | .collect(); 225 | 226 | let mut client = Client::new(cluster); 227 | 228 | let new_value = serde_json::to_value(&args.arg_new_value).unwrap(); 229 | let expected_value = serde_json::to_value(&args.arg_expected_value).unwrap(); 230 | let payload = serde_json::to_string(&Message::Cas(args.arg_key.clone(), expected_value, new_value)).unwrap(); 231 | 232 | let response = client.propose(payload.as_bytes()).unwrap(); 233 | 234 | println!("{}", String::from_utf8(response).unwrap()) 235 | } 236 | 237 | /// A state machine that holds a hashmap. 238 | #[derive(Debug, Default)] 239 | pub struct HashmapStateMachine { 240 | map: HashMap<String, Value>, 241 | } 242 | 243 | /// Implement anything you want... A `new()` is generally a great idea. 244 | impl HashmapStateMachine { 245 | pub fn new() -> HashmapStateMachine { 246 | HashmapStateMachine { 247 | map: HashMap::new(), 248 | } 249 | } 250 | } 251 | 252 | /// Implementing `state_machine::StateMachine` allows your application-specific state machine to be 253 | /// used in Raft. Feel encouraged to base yours off of one of ours in these examples. 254 | impl state_machine::StateMachine for HashmapStateMachine { 255 | 256 | /// `apply()` is called when a client's `.propose()` is committed and reaches the state 257 | /// machine. At this point it is durable and is going to be applied on at least half the nodes 258 | /// within the next couple round trips. 259 | fn apply(&mut self, new_value: &[u8]) -> Vec<u8> { 260 | scoped_info!("Applying {:?}", String::from_utf8_lossy(new_value)); 261 | // Deserialize 262 | let string = String::from_utf8_lossy(new_value); 263 | let message = serde_json::from_str(&string).unwrap(); 264 | 265 | // Handle 266 | let response = match message { 267 | Get(key) => { 268 | let old_value = &self.map.get(&key).cloned(); 269 | serde_json::to_string(old_value) 270 | }, 271 | Put(key, value) => { 272 | let old_value = &self.map.insert(key, value); 273 | serde_json::to_string(old_value) 274 | }, 275 | Cas(key, old_check, new) => { 276 | if self.map[&key] == old_check { 277 | let _ = self.map.insert(key, new); 278 | serde_json::to_string(&true) 279 | } else { 280 | serde_json::to_string(&false) 281 | } 282 | }, 283 | }; 284 | 285 | // Respond. 286 | response.unwrap().into_bytes() 287 | } 288 | 289 | /// `query()` is called when a client's `.query()` is received. It does not go through the 290 | /// persistent log, it does not mutate the state of the state machine, and it is intended to be 291 | /// fast. 292 | fn query(&self, query: &[u8]) -> Vec<u8> { 293 | scoped_info!("Querying {:?}", String::from_utf8_lossy(query)); 294 | // Deserialize 295 | let string = String::from_utf8_lossy(query); 296 | let message = serde_json::from_str(&string).unwrap(); 297 | 298 | // Handle 299 | let response = match message { 300 | Get(key) => { 301 | let old_value = &self.map.get(&key).cloned(); 302 | serde_json::to_string(old_value) 303 | }, 304 | _ => panic!("Can't do mutating requests in query"), 305 | }; 306 | 307 | // Respond.
308 | response.unwrap().into_bytes() 309 | } 310 | 311 | fn snapshot(&self) -> Vec { 312 | serde_json::to_string(&self.map) 313 | .unwrap() 314 | .into_bytes() 315 | } 316 | 317 | fn restore_snapshot(&mut self, snapshot_value: Vec) { 318 | self.map = serde_json::from_str(&String::from_utf8_lossy(&snapshot_value)).unwrap(); 319 | () 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /examples/register.rs: -------------------------------------------------------------------------------- 1 | extern crate bincode; 2 | extern crate docopt; 3 | extern crate env_logger; 4 | extern crate raft; 5 | extern crate rustc_serialize; 6 | extern crate serde; 7 | #[macro_use] extern crate serde_derive; 8 | 9 | use std::collections::HashMap; 10 | use std::net::{SocketAddr, ToSocketAddrs}; 11 | use std::path::Path; 12 | use std::process; 13 | 14 | use docopt::Docopt; 15 | 16 | use raft::{ 17 | state_machine, 18 | persistent_log, 19 | ServerId, 20 | Server, 21 | Client, 22 | }; 23 | 24 | /// Proposal operations supported by the distributed register. Proposals may 25 | /// mutate the register, and will be durably replicated to a quorum of peers 26 | /// before completing. 27 | #[derive(Serialize, Deserialize)] 28 | enum Proposal { 29 | Put(String), 30 | Cas(String, String), 31 | } 32 | 33 | /// Query operations supported by the distributed register. Queries may 34 | /// not mutate the register, and are serviced by the the current master replica. 35 | #[derive(Serialize, Deserialize)] 36 | enum Query { 37 | Get, 38 | } 39 | 40 | /// A response to a get, put or cas operation. 41 | #[derive(Serialize, Deserialize)] 42 | enum Response { 43 | /// The operation succeeded. 44 | Ok(String), 45 | /// The operation failed. 46 | Err(String), 47 | } 48 | 49 | static USAGE: &'static str = " 50 | A replicated mutable value. Operations on the register have serializable 51 | consistency, but no durability (once all register servers are terminated the 52 | value is lost). 53 | 54 | Each register server holds a replica of the register, and coordinates with its 55 | peers to update the register's value according to client commands. The register 56 | is available for reading and writing only if a majority of register servers are 57 | available. 58 | 59 | Commands: 60 | 61 | get Returns the current value of the register. 62 | 63 | put Sets the current value of the register, and returns the previous 64 | value. 65 | 66 | cas (compare and set) Conditionally sets the value of the register if the 67 | current value matches an expected value, and returns the previous 68 | value. 69 | 70 | server Starts a register server. Servers must be provided a unique ID and 71 | address (ip:port) at startup, along with the ID and address of all 72 | peer servers. 73 | 74 | Usage: 75 | register get ()... 76 | register put ()... 77 | register cas ()... 78 | register server [( )]... 79 | register (-h | --help) 80 | 81 | Options: 82 | -h --help Show a help message. 
83 | "; 84 | 85 | #[derive(Debug, RustcDecodable)] 86 | struct Args { 87 | cmd_server: bool, 88 | cmd_get: bool, 89 | cmd_put: bool, 90 | cmd_cas: bool, 91 | 92 | arg_id: Option, 93 | arg_node_id: Vec, 94 | arg_node_address: Vec, 95 | arg_server_id: Option, 96 | 97 | arg_new_value: String, 98 | arg_expected_value: String, 99 | } 100 | 101 | fn main() { 102 | let _ = env_logger::init(); 103 | let args: Args = Docopt::new(USAGE) 104 | .and_then(|d| d.decode()) 105 | .unwrap_or_else(|e| e.exit()); 106 | if args.cmd_server { 107 | server(&args); 108 | } else if args.cmd_get { 109 | get(&args); 110 | } else if args.cmd_put { 111 | put(&args); 112 | } else if args.cmd_cas { 113 | cas(&args); 114 | } 115 | } 116 | 117 | /// Parses a socket address from a string, or panics with an error message. 118 | fn parse_addr(addr: &str) -> SocketAddr { 119 | addr.to_socket_addrs() 120 | .expect(&format!("unable to parse socket address: {}", addr)) 121 | .next() 122 | .unwrap() 123 | } 124 | 125 | /// Creates a new client connection to the raft servers specified in the arguments. 126 | fn create_client(args: &Args) -> Client { 127 | // Parse raft server addresses from arguments. 128 | let cluster = args.arg_node_address.iter() 129 | .map(|v| parse_addr(v)) 130 | .collect(); 131 | 132 | Client::new(cluster) 133 | } 134 | 135 | /// Handles a response message by printing the value on success, or printing the 136 | /// error and exiting on failure. 137 | fn handle_response(response: &[u8]) { 138 | match bincode::deserialize(response).unwrap() { 139 | Response::Ok(val) => println!("{}", val), 140 | Response::Err(err) => { 141 | println!("{}", err); 142 | process::exit(1); 143 | } 144 | } 145 | } 146 | 147 | /// Creates a raft server running on the current thread with options provided by `args`. 148 | fn server(args: &Args) { 149 | // Creating a raft server requires several things: 150 | 151 | // A log implementation, which manages the persistent, replicated log. 152 | 153 | // A state machine implementation. The state machine type must be the same 154 | // on all nodes. 155 | let state_machine = RegisterStateMachine::new(); 156 | 157 | // A unique server id. 158 | let id = ServerId::from(args.arg_id.unwrap()); 159 | 160 | let log = persistent_log::FsLog::new(Path::new(&format!("/tmp/register-raftlog.{}", id.as_u64()))).unwrap(); 161 | 162 | // A list of peers. 163 | let mut peers = args.arg_node_id 164 | .iter() 165 | .zip(args.arg_node_address.iter()) 166 | .map(|(&id, addr)| (ServerId::from(id), parse_addr(addr))) 167 | .collect::>(); 168 | 169 | // The peer set must not include the local server's ID. 170 | let addr = peers.remove(&id).unwrap(); 171 | 172 | // Run the raft server. 173 | Server::new(id, addr, log, state_machine) 174 | .with_election_min_millis(150) 175 | .with_election_max_millis(300) 176 | .with_heartbeat_millis(60) 177 | .with_peers(peers) 178 | .run() 179 | .unwrap(); 180 | } 181 | 182 | /// Retrieves the value of the register from the provided raft cluster. 183 | /// 184 | /// Panics if the get fails. 185 | fn get(args: &Args) { 186 | let mut client = create_client(args); 187 | let request = bincode::serialize(&Query::Get, bincode::Infinite).unwrap(); 188 | handle_response(&client.query(&request).unwrap()); 189 | } 190 | 191 | /// Sets a value for a given key in the provided raft cluster. 
192 | fn put(args: &Args) { 193 | let mut client = create_client(args); 194 | let proposal = Proposal::Put(args.arg_new_value.clone()); 195 | let request = bincode::serialize(&proposal, bincode::Infinite).unwrap(); 196 | handle_response(&client.propose(&request).unwrap()); 197 | } 198 | 199 | /// Atomically sets the register value if the current value equals the expected 200 | /// value. 201 | fn cas(args: &Args) { 202 | let mut client = create_client(args); 203 | let proposal = Proposal::Cas(args.arg_expected_value.clone(), 204 | args.arg_new_value.clone()); 205 | let request = bincode::serialize(&proposal, bincode::Infinite).unwrap(); 206 | handle_response(&client.propose(&request).unwrap()); 207 | } 208 | 209 | /// A state machine that holds a single mutable string value. 210 | #[derive(Debug, Default)] 211 | pub struct RegisterStateMachine { 212 | value: String, 213 | } 214 | 215 | impl RegisterStateMachine { 216 | 217 | /// Creates a new register state machine with empty state. 218 | pub fn new() -> RegisterStateMachine { 219 | RegisterStateMachine { value: String::new() } 220 | } 221 | } 222 | 223 | /// `StateMachine` implementation that provides register semantics. 224 | /// 225 | /// The register is mutated by calls to `apply`, and queried by calls to 226 | /// `query`. 227 | impl state_machine::StateMachine for RegisterStateMachine { 228 | 229 | fn apply(&mut self, proposal: &[u8]) -> Vec { 230 | 231 | let message = match bincode::deserialize::(proposal) { 232 | Ok(proposal) => proposal, 233 | Err(err) => return format!("{}", err).into_bytes(), 234 | }; 235 | 236 | // Encoding the current value should never fail. 237 | let response = bincode::serialize(&Response::Ok(self.value.clone()), 238 | bincode::Infinite).unwrap(); 239 | match message { 240 | Proposal::Put(val) => self.value = val, 241 | Proposal::Cas(test, new) => { 242 | if test == self.value { 243 | self.value = new; 244 | } 245 | }, 246 | } 247 | 248 | response 249 | } 250 | 251 | fn query(&self, query: &[u8]) -> Vec { 252 | if let Err(err) = bincode::deserialize::(query) { 253 | return format!("{}", err).into_bytes(); 254 | } 255 | 256 | // Encoding the current value should never fail. 257 | bincode::serialize(&Response::Ok(self.value.clone()), 258 | bincode::Infinite).unwrap() 259 | } 260 | 261 | fn snapshot(&self) -> Vec { 262 | self.value.clone().into_bytes() 263 | } 264 | 265 | fn restore_snapshot(&mut self, value: Vec) { 266 | self.value = String::from_utf8(value).unwrap(); 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /experiments/playbooks/install.yml: -------------------------------------------------------------------------------- 1 | # Installs on an Ubuntu host 2 | - hosts: all 3 | tasks: 4 | - name: Add `ppa:hansjorg/rust` PPA 5 | apt_repository: repo='ppa:hansjorg/rust' 6 | - name: Install requirements. 7 | apt: 8 | pkg: "{{ item }}" 9 | state: present 10 | with_items: 11 | - rust-nightly 12 | - cargo-nightly 13 | - git 14 | # For dashboard. 15 | - mosh 16 | # For Cap'n Proto 17 | - subversion 18 | - gcc-4.8 19 | - g++-4.8 20 | - build-essential 21 | - autoconf 22 | - automake 23 | - libtool 24 | - name: Clone Cap'n Proto. 
25 | git: 26 | repo: https://github.com/kentonv/capnproto.git 27 | version: release-0.5.3 28 | dest: /capnproto 29 | update: yes 30 | accept_hostkey: true 31 | - name: Build Cap'n Proto 32 | shell: | 33 | ./setup-autotools.sh && 34 | autoreconf -i && 35 | ./configure --disable-shared && 36 | make -j5 && 37 | make install 38 | args: 39 | chdir: /capnproto/c++ 40 | environment: 41 | CXX: g++-4.8 42 | CC: gcc-4.8 43 | - name: Clone `raft` 44 | git: 45 | repo: https://github.com/Hoverbear/raft.git 46 | version: hostnames 47 | dest: /raft 48 | update: yes 49 | accept_hostkey: true 50 | - name: Build Raft 51 | command: cargo build --release --example hashmap 52 | args: 53 | chdir: /raft 54 | -------------------------------------------------------------------------------- /experiments/tmux/hashmap-local-3/cas: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | $BINARY cas $@ ${HOSTS} 3 | -------------------------------------------------------------------------------- /experiments/tmux/hashmap-local-3/dashboard: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -e 3 | 4 | export HOSTS="localhost:8080 localhost:8081 localhost:8082 localhost:8083" 5 | export EHOSTS="1 localhost:8081 2 localhost:8082 3 localhost:8083" 6 | export BINARY="../../../target/release/examples/hashmap" 7 | export RUST_LOG="raft=info,hashmap=debug" 8 | 9 | # cargo build --release --example hashmap 10 | 11 | tmux -L hashmap new -s hashmap -d -n "Server 1" 12 | tmux -L hashmap new-window -n "Server 2" 13 | tmux -L hashmap new-window -n "Server 3" 14 | tmux -L hashmap new-window -n "Command Seat" 15 | tmux -L hashmap select-window -t 1 16 | tmux -L hashmap -2 attach-session -t hashmap 17 | -------------------------------------------------------------------------------- /experiments/tmux/hashmap-local-3/get: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | $BINARY get $@ ${HOSTS} 3 | -------------------------------------------------------------------------------- /experiments/tmux/hashmap-local-3/put: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | $BINARY put $@ ${HOSTS} 3 | -------------------------------------------------------------------------------- /experiments/tmux/hashmap-local-3/server: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # echo $$ > /tmp/server$1 3 | $BINARY server $1 ${EHOSTS} 4 | -------------------------------------------------------------------------------- /raft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoverbear/old-raft-rs/76ea70840ee53a3cc272e5cdab055ddef01274fc/raft.png -------------------------------------------------------------------------------- /src/backoff.rs: -------------------------------------------------------------------------------- 1 | use std::cmp; 2 | 3 | use rand::{self, Rng}; 4 | 5 | /// A randomized exponential backoff policy for retrying operations. 6 | /// 7 | /// See [Exponential Backoff in Distributed Systems] 8 | /// (http://dthain.blogspot.com/2009/02/exponential-backoff-in-distributed.html) 9 | /// for algorithm details. 10 | pub struct Backoff { 11 | /// Initial backoff duration. 12 | initial: u32, 13 | 14 | /// Maximum backoff duration. 15 | max: u32, 16 | 17 | /// Number of retries since last reset. 
18 | retries: u32, 19 | } 20 | 21 | impl Backoff { 22 | /// Creates a new exponential backoff policy with the provided initial 23 | /// and maximum duration in milliseconds. 24 | /// 25 | /// The initial duration should be set at the outer limits of expected 26 | /// response time for the service. For example, if your service responds in 27 | /// 1ms on average but in 10ms for 99% of requests, then set t=10. 28 | pub fn with_duration_range(initial: u32, max: u32) -> Backoff { 29 | assert!(initial > 0, "round-trip time must be greater than 0"); 30 | Backoff { 31 | initial: initial, 32 | max: max, 33 | retries: 0, 34 | } 35 | } 36 | 37 | /// Resets the backoff to the initial state. 38 | pub fn reset(&mut self) { 39 | self.retries = 0; 40 | } 41 | 42 | /// Retrieves the next backoff duration in milliseconds. 43 | pub fn next_backoff_ms(&mut self) -> u64 { 44 | // Prevent overflow by testing if the backoff will be greater than the 45 | // max in an arithmetically stable manner, and if so return the max. 46 | if (self.max as f64 / self.initial as f64).log2() < self.retries as f64 { 47 | return self.max as u64; 48 | } 49 | 50 | let rand = rand::thread_rng().gen_range::<f64>(1.0, 2.0); 51 | let duration = ((self.initial as u64 * 2u64.pow(self.retries)) as f64 * rand) as u64; 52 | let ms = cmp::min(self.max as u64, duration); 53 | self.retries += 1; 54 | ms 55 | } 56 | } 57 | 58 | #[cfg(test)] 59 | mod tests { 60 | 61 | use super::*; 62 | 63 | #[test] 64 | fn test_exponential_backoff() { 65 | let mut backoff = Backoff::with_duration_range(1, 18); 66 | 67 | let a = backoff.next_backoff_ms(); 68 | assert!(a >= 1 && a < 2); 69 | 70 | let b = backoff.next_backoff_ms(); 71 | assert!(b >= 2 && b < 4); 72 | 73 | let c = backoff.next_backoff_ms(); 74 | assert!(c >= 4 && c < 8); 75 | 76 | let d = backoff.next_backoff_ms(); 77 | assert!(d >= 8 && d < 16); 78 | 79 | let e = backoff.next_backoff_ms(); 80 | assert!(e >= 16 && e <= 18); 81 | 82 | let f = backoff.next_backoff_ms(); 83 | assert!(f >= 18 && f <= 18); 84 | 85 | backoff.reset(); 86 | 87 | let g = backoff.next_backoff_ms(); 88 | assert!(g >= 1 && g < 2); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/client.rs: -------------------------------------------------------------------------------- 1 | //! The `Client` allows users of the `raft` library to connect to remote `Server` instances and 2 | //! issue commands to be applied to the `StateMachine`. 3 | 4 | use std::collections::HashSet; 5 | use std::fmt; 6 | use std::io::Write; 7 | use std::time::Duration; 8 | use std::net::SocketAddr; 9 | use std::net::TcpStream; 10 | use std::str::FromStr; 11 | 12 | use bufstream::BufStream; 13 | use capnp::serialize; 14 | use capnp::message::{Allocator, Builder, ReaderOptions}; 15 | 16 | use messages_capnp::{client_response, command_response}; 17 | use messages; 18 | use ClientId; 19 | use Result; 20 | use RaftError; 21 | 22 | const CLIENT_TIMEOUT: u64 = 1500; 23 | 24 | /// The representation of a Client connection to the cluster. 25 | pub struct Client { 26 | /// The `Uuid` of the client; it should be unique in the cluster. 27 | pub id: ClientId, 28 | /// The current connection to the current leader. 29 | /// If it is `None`, there may be no established leader, or a connection 30 | /// issue. 31 | leader_connection: Option<BufStream<TcpStream>>, 32 | /// A lookup for the cluster's nodes. 33 | cluster: HashSet<SocketAddr>, 34 | } 35 | 36 | impl Client { 37 | /// Creates a new client.
38 | pub fn new(cluster: HashSet) -> Client { 39 | Client { 40 | id: ClientId::new(), 41 | leader_connection: None, 42 | cluster: cluster, 43 | } 44 | } 45 | 46 | /// Proposes an entry to be appended to the replicated log. This will only 47 | /// return once the entry has been durably committed. 48 | /// Returns `Error` when the entire cluster has an unknown leader. Try proposing again later. 49 | pub fn propose(&mut self, entry: &[u8]) -> Result> { 50 | scoped_trace!("{:?}: propose", self); 51 | let mut message = messages::proposal_request(entry); 52 | self.send_message(&mut message) 53 | } 54 | 55 | /// Queries an entry from the state machine. This is non-mutating and doesn't go through the 56 | /// durable log. Like `.propose()` this will only communicate with the leader of the cluster. 57 | pub fn query(&mut self, query: &[u8]) -> Result> { 58 | scoped_trace!("{:?}: query", self); 59 | let mut message = messages::query_request(query); 60 | self.send_message(&mut message) 61 | } 62 | 63 | fn send_message(&mut self, message: &mut Builder) -> Result> 64 | where A: Allocator 65 | { 66 | let mut members = self.cluster.iter().cloned(); 67 | 68 | loop { 69 | // We presume in this loop that most errors are temporary and it may take a redirect 70 | // (or more!) to find a leader in bad network conditions. 71 | // TODO: Have timouts. 72 | let mut connection = match self.leader_connection.take() { 73 | Some(cxn) => { 74 | scoped_debug!("had existing connection {:?}", cxn.get_ref().peer_addr()); 75 | cxn 76 | } 77 | None => { 78 | let leader = try!(members.next().ok_or(RaftError::LeaderSearchExhausted)); 79 | scoped_debug!("connecting to potential leader {}", leader); 80 | // Send the preamble. 81 | let preamble = messages::client_connection_preamble(self.id); 82 | let mut stream = match TcpStream::connect(leader) { 83 | Ok(stream) => { 84 | let timeout = Some(Duration::from_millis(CLIENT_TIMEOUT)); 85 | if stream.set_read_timeout(timeout).is_err() { 86 | continue; 87 | } 88 | BufStream::new(stream) 89 | } 90 | Err(_) => continue, 91 | }; 92 | scoped_debug!("connected"); 93 | if serialize::write_message(&mut stream, &*preamble).is_err() { 94 | continue; 95 | }; 96 | stream 97 | } 98 | }; 99 | if serialize::write_message(&mut connection, message).is_err() { 100 | continue; 101 | }; 102 | if connection.flush().is_err() { 103 | continue; 104 | }; 105 | scoped_debug!("awaiting response from connection"); 106 | let response = match serialize::read_message(&mut connection, ReaderOptions::new()) { 107 | Ok(res) => res, 108 | Err(_) => continue, 109 | }; 110 | let reader = match response.get_root::() { 111 | Ok(reader) => reader, 112 | Err(_) => continue, 113 | }; 114 | match reader.which() { 115 | Ok(client_response::Which::Proposal(Ok(status))) => { 116 | match status.which() { 117 | Ok(command_response::Which::Success(data)) => { 118 | scoped_debug!("received response Success"); 119 | self.leader_connection = Some(connection); 120 | return data.map(Vec::from) 121 | .map_err(|e| e.into()); // Exit the function. 122 | } 123 | Ok(command_response::Which::UnknownLeader(())) => { 124 | scoped_debug!("received response UnknownLeader"); 125 | () // Keep looping. 
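// `leader_connection` was taken above and not restored, so the next iteration of the
// loop falls back to trying the next member from `members`.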
126 | } 127 | Ok(command_response::Which::NotLeader(leader)) => { 128 | scoped_debug!("received response NotLeader"); 129 | let leader_str = try!(leader); 130 | if !self.cluster.contains(&try!(SocketAddr::from_str(leader_str))) { 131 | scoped_debug!("cluster violation detected"); 132 | return Err(RaftError::ClusterViolation.into()); // Exit the function. 133 | } 134 | let mut connection: TcpStream = try!(TcpStream::connect(leader_str)); 135 | let preamble = messages::client_connection_preamble(self.id); 136 | if serialize::write_message(&mut connection, &*preamble).is_err() { 137 | continue; 138 | }; 139 | self.leader_connection = Some(BufStream::new(connection)); 140 | } 141 | Err(_) => continue, 142 | } 143 | } 144 | _ => panic!("Unexpected message type"), // TODO: return a proper error 145 | }; 146 | } 147 | } 148 | } 149 | 150 | impl fmt::Debug for Client { 151 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 152 | write!(fmt, "Client({})", self.id) 153 | } 154 | } 155 | 156 | 157 | #[cfg(test)] 158 | mod tests { 159 | extern crate env_logger; 160 | 161 | use std::collections::HashSet; 162 | use std::io::Write; 163 | use std::net::{TcpStream, TcpListener}; 164 | use std::thread; 165 | 166 | use uuid::Uuid; 167 | use capnp::serialize; 168 | use capnp::message::ReaderOptions; 169 | use bufstream::BufStream; 170 | 171 | use {Client, messages, Result}; 172 | use messages_capnp::{connection_preamble, client_request}; 173 | 174 | fn expect_preamble(connection: &mut TcpStream, client_id: Uuid) -> Result { 175 | let message = try!(serialize::read_message(connection, ReaderOptions::new())); 176 | let preamble = try!(message.get_root::()); 177 | // Test to make sure preamble has the right id. 178 | if let connection_preamble::id::Which::Client(Ok(id)) = try!(preamble.get_id().which()) { 179 | Ok(Uuid::from_bytes(id).unwrap() == client_id) 180 | } else { 181 | Ok(false) 182 | } 183 | } 184 | 185 | fn expect_proposal(connection: &mut TcpStream, value: &[u8]) -> Result { 186 | let message = try!(serialize::read_message(connection, ReaderOptions::new())); 187 | let request = try!(message.get_root::()); 188 | // Test to make sure request has the right value. 189 | if let client_request::Which::Proposal(Ok(proposal)) = try!(request.which()) { 190 | Ok(proposal.get_entry().unwrap() == value) 191 | } else { 192 | Ok(false) 193 | } 194 | } 195 | 196 | #[test] 197 | fn test_proposal_success() { 198 | setup_test!("test_proposal_success"); 199 | // Start up the cluster and get what we need. 200 | let mut cluster = HashSet::new(); 201 | let test_server = TcpListener::bind("127.0.0.1:0").unwrap(); 202 | let test_addr = test_server.local_addr().unwrap(); 203 | cluster.insert(test_addr); 204 | 205 | let mut client = Client::new(cluster); 206 | let client_id = client.id.0.clone(); 207 | let to_propose = b"Bears"; 208 | 209 | // The client connects on the proposal. 210 | // Wait for it. 211 | let child = thread::spawn(move || { 212 | let (mut connection, _) = test_server.accept().unwrap(); 213 | 214 | // Proposal should be fine, no errors. 215 | scoped_debug!("Should get preamble and proposal. Responds Success"); 216 | expect_preamble(&mut connection, client_id).unwrap(); 217 | expect_proposal(&mut connection, to_propose).unwrap(); 218 | // Send response! (success!) 219 | let response = messages::command_response_success(b"Foxes"); 220 | serialize::write_message(&mut connection, &*response).unwrap(); 221 | connection.flush().unwrap(); 222 | }); 223 | 224 | // Propose. It's a marriage made in heaven! 
:) 225 | // Should be ok 226 | assert_eq!(client.propose(to_propose).unwrap(), b"Foxes"); 227 | assert!(client.leader_connection.is_some()); 228 | 229 | child.join().unwrap(); 230 | } 231 | 232 | #[test] 233 | fn test_proposal_unknown_leader() { 234 | setup_test!("test_proposal_unknown_leader"); 235 | // Start up the cluster and get what we need. 236 | let mut cluster = HashSet::new(); 237 | let test_server = TcpListener::bind("127.0.0.1:0").unwrap(); 238 | let test_addr = test_server.local_addr().unwrap(); 239 | cluster.insert(test_addr); 240 | 241 | let mut client = Client::new(cluster); 242 | let to_propose = b"Bears"; 243 | 244 | // The client connects on the proposal. 245 | // Wait for it. 246 | let child = thread::spawn(move || { 247 | let (mut connection, _) = test_server.accept().unwrap(); 248 | 249 | // Proposal should report unknown leader, and have the client return error. 250 | scoped_debug!("Should get proposal. Responds UnknownLeader"); 251 | expect_proposal(&mut connection, to_propose).unwrap(); 252 | // Send response! (unknown leader!) Client should drop connection. 253 | let response = messages::command_response_unknown_leader(); 254 | serialize::write_message(&mut connection, &*response).unwrap(); 255 | connection.flush().unwrap(); 256 | }); 257 | 258 | // Propose. It's a marriage made in heaven! :) 259 | assert!(client.propose(to_propose).is_err()); 260 | 261 | child.join().unwrap(); 262 | } 263 | 264 | #[test] 265 | fn test_proposal_not_leader() { 266 | setup_test!("test_proposal_not_leader"); 267 | let mut cluster = HashSet::new(); 268 | let test_server = TcpListener::bind("127.0.0.1:0").unwrap(); 269 | let test_addr = test_server.local_addr().unwrap(); 270 | cluster.insert(test_addr); 271 | 272 | let second_server = TcpListener::bind("127.0.0.1:0").unwrap(); 273 | let second_addr = second_server.local_addr().unwrap(); 274 | cluster.insert(second_addr); 275 | 276 | let mut client = Client::new(cluster); 277 | let client_id = client.id.0.clone(); 278 | let to_propose = b"Bears"; 279 | 280 | // The client connects on the first proposal. 281 | // Wait for it. 282 | let child = thread::spawn(move || { 283 | // Proposal should report NotLeader. Client should choose the server we direct it to. 284 | scoped_debug!("Should get preamble and proposal. Responds NotLeader."); 285 | let (mut connection, _) = test_server.accept().unwrap(); 286 | expect_preamble(&mut connection, client_id).unwrap(); 287 | expect_proposal(&mut connection, to_propose).unwrap(); 288 | 289 | // Send response! (not leader!) 290 | let response = messages::command_response_not_leader(&second_addr); 291 | serialize::write_message(&mut connection, &*response).unwrap(); 292 | connection.flush().unwrap(); 293 | 294 | // Test that it seeks out other server and proposes. 295 | scoped_debug!("Second server should get preamble and proposal. Responds Success."); 296 | let (mut connection, _) = second_server.accept().unwrap(); 297 | expect_preamble(&mut connection, client_id).unwrap(); 298 | expect_proposal(&mut connection, to_propose).unwrap(); 299 | 300 | // Send final response! (Success!) 301 | let response = messages::command_response_success(b"Foxes"); 302 | serialize::write_message(&mut connection, &*response).unwrap(); 303 | }); 304 | 305 | // Workaround to set up rigged selection of servers. 
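// Pre-connecting to `test_addr` and sending the preamble makes the client treat that
// server as the current leader for its first request.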
306 | client.leader_connection = { 307 | let preamble = messages::client_connection_preamble(client.id); 308 | let mut stream = BufStream::new(TcpStream::connect(test_addr).unwrap()); 309 | serialize::write_message(&mut stream, &*preamble).unwrap(); 310 | Some(stream) 311 | }; 312 | 313 | // Should be ok, change leader connection. 314 | assert_eq!(client.propose(to_propose).unwrap(), b"Foxes"); 315 | assert!(client.leader_connection.is_some()); 316 | 317 | child.join().unwrap(); 318 | } 319 | 320 | /// This test makes sure that the client cannot be redirected to a leader which exists outside 321 | /// the cluster. This is a necessary test since it would introduce error into the cluster. 322 | #[test] 323 | fn test_proposal_leader_not_in_cluster() { 324 | setup_test!("test_proposal_leader_not_in_cluster"); 325 | let mut cluster = HashSet::new(); 326 | let test_server = TcpListener::bind("127.0.0.1:0").unwrap(); 327 | let test_addr = test_server.local_addr().unwrap(); 328 | cluster.insert(test_addr); 329 | 330 | let second_server = TcpListener::bind("127.0.0.1:0").unwrap(); 331 | let second_addr = second_server.local_addr().unwrap(); 332 | // cluster.insert(second_addr); <--- NOT in cluster. 333 | 334 | let mut client = Client::new(cluster); 335 | let client_id = client.id.0.clone(); 336 | let to_propose = b"Bears"; 337 | 338 | // The client connects on the first proposal. 339 | // Wait for it. 340 | let child = thread::spawn(move || { 341 | // Proposal should report NotLeader. Client should choose the server we direct it to. 342 | scoped_debug!("Should get preamble and proposal. Responds NotLeader."); 343 | let (mut connection, _) = test_server.accept().unwrap(); 344 | expect_preamble(&mut connection, client_id).unwrap(); 345 | expect_proposal(&mut connection, to_propose).unwrap(); 346 | 347 | // Send response! (not leader!) 348 | let response = messages::command_response_not_leader(&second_addr); 349 | serialize::write_message(&mut connection, &*response).unwrap(); 350 | connection.flush().unwrap(); 351 | 352 | // No more... 353 | }); 354 | 355 | // Workaround to set up rigged selection of servers. 356 | client.leader_connection = { 357 | let preamble = messages::client_connection_preamble(client.id); 358 | let mut stream = BufStream::new(TcpStream::connect(test_addr).unwrap()); 359 | serialize::write_message(&mut stream, &*preamble).unwrap(); 360 | Some(stream) 361 | }; 362 | 363 | // Should be err, change leader connection but to wrong ip.. 364 | assert!(client.propose(to_propose).is_err()); 365 | assert!(client.leader_connection.is_none()); 366 | 367 | child.join().unwrap(); 368 | } 369 | } 370 | -------------------------------------------------------------------------------- /src/connection.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::net::SocketAddr; 3 | use std::rc::Rc; 4 | 5 | use mio::tcp::TcpStream; 6 | use mio::Timeout as TimeoutHandle; 7 | use mio::{EventLoop, EventSet, PollOpt, Token}; 8 | use capnp::message::{Builder, HeapAllocator, Reader, ReaderOptions}; 9 | use capnp_nonblock::{MessageStream, Segments}; 10 | 11 | use ClientId; 12 | use Result; 13 | use ServerId; 14 | use backoff::Backoff; 15 | use messages; 16 | use server::{Server, ServerTimeout}; 17 | use state_machine::StateMachine; 18 | use persistent_log::Log; 19 | 20 | fn poll_opt() -> PollOpt { 21 | PollOpt::edge() | PollOpt::oneshot() 22 | } 23 | 24 | /// The type of a connection. 
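// Connections start out as `Unknown` until the remote end is identified; see
// `Connection::unknown` and `set_kind` below.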
25 | #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] 26 | pub enum ConnectionKind { 27 | /// A peer in the cluster. 28 | Peer(ServerId), 29 | /// A client which is asking the Raft cluster to do things. 30 | Client(ClientId), 31 | /// Something else. 32 | Unknown, 33 | } 34 | 35 | impl ConnectionKind { 36 | /// Returns if the `Connection` is a peer type. 37 | fn is_peer(&self) -> bool { 38 | match *self { 39 | ConnectionKind::Peer(..) => true, 40 | _ => false, 41 | } 42 | } 43 | } 44 | 45 | pub struct Connection { 46 | kind: ConnectionKind, 47 | /// The address to reconnect to - for a connection initiated by the remote, 48 | /// this is not the remote address. 49 | addr: SocketAddr, 50 | stream: Option>>>, 51 | backoff: Backoff, 52 | } 53 | 54 | impl Connection { 55 | /// Creates a new `Connection` wrapping the provided socket stream. 56 | /// 57 | /// The socket must already be connected. 58 | /// 59 | /// Note: the caller must manually set the token field after inserting the 60 | /// connection into a slab. 61 | pub fn unknown(socket: TcpStream) -> Result { 62 | let addr = try!(socket.peer_addr()); 63 | Ok(Connection { 64 | kind: ConnectionKind::Unknown, 65 | addr: addr, 66 | stream: Some(MessageStream::new(socket, ReaderOptions::new())), 67 | backoff: Backoff::with_duration_range(50, 10000), 68 | }) 69 | } 70 | 71 | /// Creates a new peer connection. 72 | pub fn peer(id: ServerId, addr: SocketAddr) -> Result { 73 | let stream = try!(TcpStream::connect(&addr)); 74 | Ok(Connection { 75 | kind: ConnectionKind::Peer(id), 76 | addr: addr, 77 | stream: Some(MessageStream::new(stream, ReaderOptions::new())), 78 | backoff: Backoff::with_duration_range(50, 10000), 79 | }) 80 | } 81 | 82 | pub fn kind(&self) -> &ConnectionKind { 83 | &self.kind 84 | } 85 | 86 | pub fn set_kind(&mut self, kind: ConnectionKind) { 87 | self.kind = kind; 88 | } 89 | 90 | pub fn addr(&self) -> &SocketAddr { 91 | &self.addr 92 | } 93 | 94 | pub fn set_addr(&mut self, addr: SocketAddr) { 95 | self.addr = addr; 96 | } 97 | 98 | /// Returns the connection's stream. 99 | /// Must only be called while the connection is active. 100 | fn stream(&self) -> &MessageStream>> { 101 | match self.stream { 102 | Some(ref stream) => stream, 103 | None => panic!(format!("{:?}: not connected", self)), 104 | } 105 | } 106 | 107 | /// Returns the connection's mutable stream. 108 | /// Must only be called while the connection is active. 109 | fn stream_mut(&mut self) 110 | -> &mut MessageStream>> { 111 | match self.stream { 112 | Some(ref mut stream) => stream, 113 | None => panic!(format!("{:?}: not connected", self)), 114 | } 115 | } 116 | 117 | /// Writes queued messages to the socket. 118 | pub fn writable(&mut self) -> Result<()> { 119 | scoped_trace!("{:?}: writable", self); 120 | if let Connection { stream: Some(ref mut stream), ref mut backoff, .. } = *self { 121 | try!(stream.write()); 122 | backoff.reset(); 123 | Ok(()) 124 | } else { 125 | panic!("{:?}: writable event while not connected", self); 126 | } 127 | } 128 | 129 | /// Reads a message from the connection's stream, or if a full message is 130 | /// not available, returns `None`. 131 | /// 132 | /// Connections are edge-triggered, so the handler must continue calling 133 | /// until no more messages are returned. 134 | pub fn readable(&mut self) -> Result>> { 135 | scoped_trace!("{:?}: readable", self); 136 | self.stream_mut().read_message().map_err(From::from) 137 | } 138 | 139 | /// Queues a message to send to the connection. 
Returns `true` if the connection should be 140 | /// reregistered with the event loop. 141 | pub fn send_message(&mut self, message: Rc>) -> Result { 142 | scoped_trace!("{:?}: send_message", self); 143 | match self.stream { 144 | Some(ref mut stream) => { 145 | // Reregister if the connection is not already registered, and 146 | // there are still messages left to send. MessageStream 147 | // optimistically sends messages, so it's likely that small 148 | // messages can be sent without ever registering. 149 | let unregistered = stream.outbound_queue_len() == 0; 150 | try!(stream.write_message(message)); 151 | Ok(unregistered && stream.outbound_queue_len() > 0) 152 | } 153 | None => Ok(false), 154 | } 155 | } 156 | 157 | fn events(&self) -> EventSet { 158 | let mut events = EventSet::all(); 159 | if self.stream().outbound_queue_len() == 0 { 160 | events = events - EventSet::writable(); 161 | } 162 | events 163 | } 164 | 165 | /// Registers the connection with the event loop. 166 | pub fn register(&mut self, 167 | event_loop: &mut EventLoop>, 168 | token: Token) 169 | -> Result<()> 170 | where L: Log, 171 | M: StateMachine 172 | { 173 | scoped_trace!("{:?}: register", self); 174 | event_loop.register(self.stream().inner(), token, self.events(), poll_opt()) 175 | .map_err(|error| { 176 | scoped_warn!("{:?}: reregister failed: {}", self, error); 177 | From::from(error) 178 | }) 179 | } 180 | 181 | /// Reregisters the connection with the event loop. 182 | pub fn reregister(&mut self, 183 | event_loop: &mut EventLoop>, 184 | token: Token) 185 | -> Result<()> 186 | where L: Log, 187 | M: StateMachine 188 | { 189 | scoped_trace!("{:?}: reregister", self); 190 | event_loop.reregister(self.stream().inner(), token, self.events(), poll_opt()) 191 | .map_err(|error| { 192 | scoped_warn!("{:?}: register failed: {}", self, error); 193 | From::from(error) 194 | }) 195 | } 196 | 197 | /// Reconnects to the given peer ID and sends the preamble, advertising the 198 | /// given local address to the peer. 199 | pub fn reconnect_peer(&mut self, id: ServerId, local_addr: &SocketAddr) -> Result<()> { 200 | scoped_assert!(self.kind.is_peer()); 201 | scoped_trace!("{:?}: reconnect", self); 202 | self.stream = Some(MessageStream::new(try!(TcpStream::connect(&self.addr)), 203 | ReaderOptions::new())); 204 | try!(self.send_message(messages::server_connection_preamble(id, local_addr))); 205 | Ok(()) 206 | } 207 | 208 | /// Resets a peer connection. 
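    ///
    /// The stream is dropped and a `ServerTimeout::Reconnect` timeout is scheduled
    /// for the next backoff duration. When that timeout fires the server is expected
    /// to call `reconnect_peer`; the backoff is reset again on the next successful
    /// write (see `writable` above).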
209 | pub fn reset_peer(&mut self, 210 | event_loop: &mut EventLoop>, 211 | token: Token) 212 | -> Result<(ServerTimeout, TimeoutHandle)> 213 | where L: Log, 214 | M: StateMachine 215 | { 216 | scoped_assert!(self.kind.is_peer()); 217 | self.stream = None; 218 | let duration = self.backoff.next_backoff_ms(); 219 | let timeout = ServerTimeout::Reconnect(token); 220 | let handle = event_loop.timeout_ms(timeout, duration).unwrap(); 221 | 222 | scoped_info!("{:?}: reset, will attempt to reconnect in {}ms", 223 | self, 224 | duration); 225 | Ok((timeout, handle)) 226 | } 227 | 228 | pub fn clear_messages(&mut self) { 229 | if let Some(ref mut stream) = self.stream { 230 | stream.clear_outbound_queue(); 231 | } 232 | } 233 | } 234 | 235 | impl fmt::Debug for Connection { 236 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 237 | match self.kind { 238 | ConnectionKind::Peer(id) => write!(fmt, "PeerConnection({})", id), 239 | ConnectionKind::Client(id) => write!(fmt, "ClientConnection({})", id), 240 | ConnectionKind::Unknown => write!(fmt, "UnknownConnection({})", &self.addr), 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![crate_name = "raft"] 2 | #![crate_type="lib"] 3 | #![doc(html_logo_url = "https://raw.githubusercontent.com/Hoverbear/raft/master/raft.png")] 4 | #![doc(html_root_url = "https://hoverbear.github.io/raft/raft/")] 5 | #![cfg_attr(feature = "cargo-clippy", allow(doc_markdown))] 6 | 7 | //! This is the Raft Distributed Consensus Protocol implemented for Rust. 8 | //! [Raft](http://raftconsensus.github.io/) is described as: 9 | //! 10 | //! > Raft is a consensus algorithm that is designed to be easy to understand. It's equivalent to 11 | //! > Paxos in fault-tolerance and performance. The difference is that it's decomposed into 12 | //! > relatively independent subproblems, and it cleanly addresses all major pieces needed for 13 | //! > practical systems. 14 | //! 15 | //! This implementation utilizes [Cap'n Proto](https://kentonv.github.io/capnproto/) for its RPC, 16 | //! [`mio`](https://github.com/carllerche/mio) for it's async event loop. 17 | //! 18 | //! If this package fails to build for you it is possibly because you do not have the 19 | //! [`capnp`](https://capnproto.org/capnp-tool.html) utility installed. You should be able to find 20 | //! appropriate packages for most popular distributions. 21 | //! 22 | //! # Consuming this library 23 | //! 24 | //! Consuming this library works in a few parts: 25 | //! 26 | //! 1. Implement or consume a Persistent Log and a State Machine such that they will hook into 27 | //! your application desirably. 28 | //! 2. Create a `Server` with those implementations. It will independently fire up and join the 29 | //! cluster. 30 | //! 3. Interact with the cluster by issuing `.propose()` and `.query()` calls via the `Client` 31 | //! 4. React to calls to `.propose()` and `.query()` from the implemented `StateMachine` 32 | //! 33 | //! ## Persistent Log 34 | //! 35 | //! A `Log` represents the **replicated, persistent log** of your application. It has a 36 | //! strong ordering such that `A → B → C` and should **only** act to store information. Entries 37 | //! placed into the log should not be acted on in any way by the consuming application as they 38 | //! have not been committed to the cluster. 39 | //! 40 | //! Some ideas for a Persistent Log implementation: 41 | //! 
42 | //! * A PostgreSQL / SQLite instance. 43 | //! * A plain old file. 44 | //! * A vector in memory *(Note: Log compaction is still pending, so be aware of running out!)* 45 | //! 46 | //! > It is our belief that in many cases the implementation of `Log` will be generic to 47 | //! > application purposes. You are encouraged to submit your own implementations to us! 48 | //! 49 | //! ## State Machine 50 | //! 51 | //! The `StateMachine` represents the **stateful representation** of your application. Events 52 | //! are applied to the `StateMachine` in the correct ordering at the time of commit. This is where 53 | //! your application **should** act on information. 54 | //! 55 | //! In the `StateMachine` there are both mutable (`.apply()`) and immutable (`.query()`) calls. 56 | //! There is a considerable performance difference, as `.query()` calls do not pass through the 57 | //! durable `Log` while `.apply()` events do. 58 | //! 59 | //! Some ideas for a State Machine implementation: 60 | //! 61 | //! * A Hashmap or key-value store (Example provided) 62 | //! * A single register (Example provided) 63 | //! * Basically anything from `std::collections` 64 | //! 65 | //! ## Client Requests 66 | //! 67 | //! Client requests are the **only** way to interact with the Raft cluster. Calls to `.propose()` 68 | //! and `.query()` are automatically routed to the relevant `Leader` node and behave as blocking 69 | //! calls. 70 | //! 71 | //! This means `.propose()` won't return until the entry is durably replicated into the log of at 72 | //! least the majority of the cluster and has been commited. `.query()` will perform better if 73 | //! you wish to only read data and not have it pass through the persisted log. 74 | //! 75 | 76 | #![cfg_attr(test, feature(test))] 77 | extern crate bufstream; 78 | extern crate byteorder; 79 | extern crate capnp; 80 | extern crate capnp_nonblock; 81 | extern crate mio; 82 | extern crate rand; 83 | extern crate uuid; 84 | #[macro_use] 85 | extern crate log; 86 | #[macro_use] 87 | extern crate scoped_log; 88 | #[macro_use] 89 | extern crate wrapped_enum; 90 | #[cfg(test)] 91 | extern crate env_logger; 92 | #[cfg(feature = "serde")] 93 | extern crate serde; 94 | 95 | /// Prepares the environment testing. Should be called as the first line of every test with the 96 | /// name of the test as the only argument. 97 | /// 98 | /// TODO: Make this an annotation like #[rust_test] instead of a macro. 99 | #[cfg(test)] 100 | macro_rules! setup_test { 101 | ($test_name:expr) => ( 102 | let _ = env_logger::init(); 103 | push_log_scope!($test_name); 104 | ); 105 | } 106 | 107 | pub mod state_machine; 108 | pub mod persistent_log; 109 | pub mod messages_capnp { 110 | #![allow(dead_code)] 111 | include!(concat!(env!("OUT_DIR"), "/messages_capnp.rs")); 112 | } 113 | 114 | mod backoff; 115 | mod client; 116 | mod connection; 117 | mod messages; 118 | mod consensus; 119 | mod server; 120 | mod state; 121 | 122 | pub use server::Server; 123 | pub use state_machine::StateMachine; 124 | pub use persistent_log::Log; 125 | pub use client::Client; 126 | 127 | use std::{io, net, ops, fmt}; 128 | 129 | use uuid::Uuid; 130 | 131 | /// A simple convienence type. 
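/// It is `std::result::Result` with the error type fixed to this crate's `Error`,
/// and it is what most fallible APIs in the crate return.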
132 | pub type Result = std::result::Result; 133 | 134 | wrapped_enum!{ 135 | #[doc = "The generic `raft::Error` is composed of one of the errors that can originate from the"] 136 | #[doc = "various libraries consumed by the library."] 137 | #[doc = "With the exception of the `Raft` variant these are generated from `try!()` macros invoking"] 138 | #[doc = "on `io::Error` or `capnp::Error` by using"] 139 | #[doc = "[`FromError`](https://doc.rust-lang.org/std/error/#the-fromerror-trait)."] 140 | #[derive(Debug)] 141 | pub enum Error { 142 | /// An error originating from the [Cap'n Proto](https://github.com/dwrensha/capnproto-rust) library. 143 | CapnProto(capnp::Error), 144 | /// A specific error produced when a bad Cap'n proto message is discovered. 145 | SchemaError(capnp::NotInSchema), 146 | /// Errors originating from `std::io`. 147 | Io(io::Error), 148 | /// Raft specific errors. 149 | Raft(RaftError), 150 | /// Errors related to parsing addresses. 151 | AddrParse(net::AddrParseError), 152 | } 153 | } 154 | 155 | /// A Raft Error represents a Raft specific error that consuming code is expected to handle 156 | /// gracefully. 157 | #[derive(Debug)] 158 | pub enum RaftError { 159 | /// The server ran out of slots in the slab for new connections 160 | ConnectionLimitReached, 161 | /// A client reported an invalid client id 162 | InvalidClientId, 163 | /// A consensus module reported back a leader not in the cluster. 164 | ClusterViolation, 165 | /// A remote connection attempted to use an unknown connection type in the connection preamble 166 | UnknownConnectionType, 167 | /// An invalid peer in in the peer set. Returned Server::new(). 168 | InvalidPeerSet, 169 | /// Registering a connection failed 170 | ConnectionRegisterFailed, 171 | /// Failed to find a leader in the cluster. Try again later. 172 | LeaderSearchExhausted, 173 | } 174 | 175 | impl fmt::Display for Error { 176 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 177 | match *self { 178 | Error::CapnProto(ref error) => fmt::Display::fmt(error, f), 179 | Error::SchemaError(ref error) => fmt::Display::fmt(error, f), 180 | Error::Io(ref error) => fmt::Display::fmt(error, f), 181 | Error::Raft(ref error) => fmt::Debug::fmt(error, f), 182 | Error::AddrParse(ref error) => fmt::Debug::fmt(error, f), 183 | } 184 | } 185 | } 186 | 187 | /// The term of a log entry. 188 | #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] 189 | pub struct Term(u64); 190 | impl Term { 191 | pub fn as_u64(self) -> u64 { 192 | self.0 193 | } 194 | } 195 | impl From for Term { 196 | fn from(val: u64) -> Term { 197 | Term(val) 198 | } 199 | } 200 | impl Into for Term { 201 | fn into(self) -> u64 { 202 | self.0 203 | } 204 | } 205 | impl ops::Add for Term { 206 | type Output = Term; 207 | fn add(self, rhs: u64) -> Term { 208 | Term(self.0.checked_add(rhs).expect("overflow while incrementing Term")) 209 | } 210 | } 211 | impl ops::Sub for Term { 212 | type Output = Term; 213 | fn sub(self, rhs: u64) -> Term { 214 | Term(self.0.checked_sub(rhs).expect("underflow while decrementing Term")) 215 | } 216 | } 217 | impl fmt::Display for Term { 218 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 219 | fmt::Display::fmt(&self.0, f) 220 | } 221 | } 222 | 223 | /// The index of a log entry. 
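///
/// Log indexes are 1-based: `LogIndex(0)` denotes an empty log and the first entry
/// lives at `LogIndex(1)`; the `persistent_log` implementations translate an index
/// into a vector offset with `index - 1`. Subtracting one `LogIndex` from another
/// yields the `u64` offset between them.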
224 | #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] 225 | pub struct LogIndex(u64); 226 | impl LogIndex { 227 | pub fn as_u64(self) -> u64 { 228 | self.0 229 | } 230 | } 231 | impl From for LogIndex { 232 | fn from(val: u64) -> LogIndex { 233 | LogIndex(val) 234 | } 235 | } 236 | impl Into for LogIndex { 237 | fn into(self) -> u64 { 238 | self.0 239 | } 240 | } 241 | impl ops::Add for LogIndex { 242 | type Output = LogIndex; 243 | fn add(self, rhs: u64) -> LogIndex { 244 | LogIndex(self.0.checked_add(rhs).expect("overflow while incrementing LogIndex")) 245 | } 246 | } 247 | impl ops::Sub for LogIndex { 248 | type Output = LogIndex; 249 | fn sub(self, rhs: u64) -> LogIndex { 250 | LogIndex(self.0.checked_sub(rhs).expect("underflow while decrementing LogIndex")) 251 | } 252 | } 253 | /// Find the offset between two log indices. 254 | impl ops::Sub for LogIndex { 255 | type Output = u64; 256 | fn sub(self, rhs: LogIndex) -> u64 { 257 | self.0.checked_sub(rhs.0).expect("underflow while subtracting LogIndex") 258 | } 259 | } 260 | impl fmt::Display for LogIndex { 261 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 262 | fmt::Display::fmt(&self.0, f) 263 | } 264 | } 265 | 266 | /// The ID of a Raft server. Must be unique among the participants in a 267 | /// consensus group. 268 | #[derive(Copy, Clone, Hash, PartialEq, Eq)] 269 | pub struct ServerId(u64); 270 | 271 | impl ServerId { 272 | pub fn as_u64(self) -> u64 { 273 | self.0 274 | } 275 | } 276 | impl From for ServerId { 277 | fn from(val: u64) -> ServerId { 278 | ServerId(val) 279 | } 280 | } 281 | impl Into for ServerId { 282 | fn into(self) -> u64 { 283 | self.0 284 | } 285 | } 286 | impl fmt::Debug for ServerId { 287 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 288 | write!(f, "ServerId({})", self.0) 289 | } 290 | } 291 | impl fmt::Display for ServerId { 292 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 293 | fmt::Display::fmt(&self.0, f) 294 | } 295 | } 296 | 297 | /// The ID of a Raft client. 298 | #[derive(Copy, Clone, Hash, PartialEq, Eq)] 299 | pub struct ClientId(Uuid); 300 | impl ClientId { 301 | fn new() -> ClientId { 302 | ClientId(Uuid::new_v4()) 303 | } 304 | fn as_bytes(&self) -> &[u8] { 305 | self.0.as_bytes() 306 | } 307 | fn from_bytes(bytes: &[u8]) -> Result { 308 | match Uuid::from_bytes(bytes) { 309 | Some(uuid) => Ok(ClientId(uuid)), 310 | None => Err(Error::Raft(RaftError::InvalidClientId)), 311 | } 312 | } 313 | } 314 | impl fmt::Debug for ClientId { 315 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 316 | write!(f, "ClientId({})", self.0) 317 | } 318 | } 319 | impl fmt::Display for ClientId { 320 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 321 | fmt::Display::fmt(&self.0, f) 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /src/messages.capnp: -------------------------------------------------------------------------------- 1 | @0xbdca3d7c76dab735; 2 | 3 | struct ConnectionPreamble { 4 | # Every connection opened to a Raft server, whether it is from a peer server 5 | # or a client, must begin with a ConnectionPreamble message. The Raft server 6 | # will not reply to this message, and it is safe for the connecting process 7 | # to immediately begin sending further messages. The connecting process must 8 | # include its ID, which indicates if the connecting process is a server or 9 | # client. 
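  # In this implementation a client ID is the 16-byte representation of a UUID;
  # see `ClientId` in src/lib.rs.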
10 | 11 | id :union { 12 | server @0 :Peer; 13 | # Indicates that the connecting process is a Raft peer, and that all 14 | # further messages in the connection (in both directions) will be of 15 | # type Message. 16 | 17 | client @1 :Data; 18 | # Indicates that the connecting process is a client, and that all 19 | # further messages sent by the client will be of type ClientRequest, and 20 | # all replys from the server to the client will be of type 21 | # ClientResponse. 22 | } 23 | } 24 | 25 | struct Peer { 26 | id @0 :UInt64; 27 | 28 | addr @1 :Text; 29 | # The address to use for reconnecting or to redirect clients to 30 | # when not leader. 31 | } 32 | 33 | struct Entry { 34 | # A log entry. 35 | 36 | term @0 :UInt64; 37 | # The term of the entry. 38 | 39 | data @1 :Data; 40 | # The user-defined data of the entry. 41 | } 42 | 43 | struct Message { 44 | 45 | union { 46 | appendEntriesRequest @0 :AppendEntriesRequest; 47 | appendEntriesResponse @1 :AppendEntriesResponse; 48 | requestVoteResponse @2 :RequestVoteResponse; 49 | requestVoteRequest @3 :RequestVoteRequest; 50 | } 51 | } 52 | 53 | struct AppendEntriesRequest { 54 | 55 | term @0 :UInt64; 56 | # The leader's term. 57 | 58 | prevLogIndex @1 :UInt64; 59 | # Index of log entry immediately preceding new ones. 60 | 61 | prevLogTerm @2 :UInt64; 62 | # Term of prevLogIndex entry. 63 | 64 | entries @3 :List(Entry); 65 | # Log entries to store (empty for heartbeat; may send more than one for 66 | # efficiency). 67 | 68 | leaderCommit @4 :UInt64; 69 | # The Leader’s commit log index. 70 | } 71 | 72 | struct AppendEntriesResponse { 73 | 74 | term @0 :UInt64; 75 | # The responder's current term. 76 | 77 | union { 78 | success @1 :UInt64; 79 | # The `AppendEntries` request was a success. The responder's latest log 80 | # index is returned. 81 | 82 | staleTerm @2 :Void; 83 | # The `AppendEntries` request failed because the follower has a greater term 84 | # than the leader. 85 | 86 | inconsistentPrevEntry @3 :UInt64; 87 | # The `AppendEntries` request failed because the follower failed the 88 | # previous entry term and index checks. Includes the index of the 89 | # inconsistent entry. 90 | 91 | internalError @4 :Text; 92 | # an internal error occured; a description is included. 93 | } 94 | } 95 | 96 | struct RequestVoteRequest { 97 | 98 | term @0 :UInt64; 99 | # The candidate's term. 100 | 101 | lastLogIndex @1 :UInt64; 102 | # The index of the candidate's last log entry. 103 | 104 | lastLogTerm @2 :UInt64; 105 | # The term of the candidate's last log entry. 106 | } 107 | 108 | struct RequestVoteResponse { 109 | 110 | term @0 :UInt64; 111 | # The responder's current term. 112 | 113 | union { 114 | granted @1 :Void; 115 | # The voter voted for the candidate. 116 | 117 | staleTerm @2 :Void; 118 | # The `RequestVote` request failed because the voter has a greater term 119 | # than the candidate. 120 | 121 | alreadyVoted @3 :Void; 122 | # The voter did not vote for the candidate, because the voter already voted 123 | # in the term. 124 | 125 | inconsistentLog @4 :Void; 126 | # The `RequestVote` request failed because the candidate's log is not 127 | # up-to-date with the voter's log. 128 | 129 | internalError @5 :Text; 130 | # An internal error occurred; a description is included. 
131 | } 132 | } 133 | 134 | struct ClientRequest { 135 | union { 136 | ping @0 :PingRequest; 137 | proposal @1 :ProposalRequest; 138 | query @2 :QueryRequest; 139 | } 140 | } 141 | 142 | struct ClientResponse { 143 | union { 144 | ping @0 :PingResponse; 145 | proposal @1 :CommandResponse; 146 | query @2 :CommandResponse; 147 | } 148 | } 149 | 150 | struct PingRequest { 151 | } 152 | 153 | struct PingResponse { 154 | 155 | term @0 :UInt64; 156 | # The server's current term. 157 | 158 | index @1 :UInt64; 159 | # The server's current index. 160 | 161 | state :union { 162 | # The server's current state. 163 | leader @2 :Void; 164 | follower @3 :Void; 165 | candidate @4 :Void; 166 | } 167 | } 168 | 169 | struct ProposalRequest { 170 | entry @0 :Data; 171 | # An entry to append. 172 | } 173 | 174 | struct QueryRequest { 175 | query @0 :Data; 176 | # An query to issue to the state machine. 177 | } 178 | 179 | struct CommandResponse { 180 | union { 181 | success @0 :Data; 182 | # The proposal succeeded. 183 | 184 | unknownLeader @1 :Void; 185 | # The proposal failed because the Raft node is not the leader, and does 186 | # not know who the leader is. 187 | 188 | notLeader @2 :Text; 189 | # The client request failed because the Raft node is not the leader. 190 | # The value returned may be the address of the current leader. 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/messages.rs: -------------------------------------------------------------------------------- 1 | //! Utility functions for working with Cap'n Proto Raft messages. 2 | #![allow(dead_code)] 3 | 4 | use std::net::SocketAddr; 5 | use std::rc::Rc; 6 | 7 | use capnp::message::{Builder, HeapAllocator}; 8 | 9 | use {ClientId, Term, LogIndex, ServerId}; 10 | use messages_capnp::{client_request, client_response, connection_preamble, message}; 11 | 12 | // ConnectionPreamble 13 | 14 | pub fn server_connection_preamble(id: ServerId, addr: &SocketAddr) -> Rc> { 15 | let mut message = Builder::new_default(); 16 | { 17 | let mut server = message.init_root::() 18 | .init_id() 19 | .init_server(); 20 | server.set_addr(&format!("{}", addr)); 21 | server.set_id(id.as_u64()); 22 | } 23 | Rc::new(message) 24 | } 25 | 26 | pub fn client_connection_preamble(id: ClientId) -> Rc> { 27 | let mut message = Builder::new_default(); 28 | { 29 | message.init_root::() 30 | .init_id() 31 | .set_client(id.as_bytes()); 32 | } 33 | Rc::new(message) 34 | } 35 | 36 | // AppendEntries 37 | 38 | pub fn append_entries_request(term: Term, 39 | prev_log_index: LogIndex, 40 | prev_log_term: Term, 41 | entries: &[(Term, &[u8])], 42 | leader_commit: LogIndex) 43 | -> Rc> { 44 | let mut message = Builder::new_default(); 45 | { 46 | let mut request = message.init_root::() 47 | .init_append_entries_request(); 48 | request.set_term(term.as_u64()); 49 | request.set_prev_log_index(prev_log_index.as_u64()); 50 | request.set_prev_log_term(prev_log_term.as_u64()); 51 | request.set_leader_commit(leader_commit.as_u64()); 52 | 53 | let mut entry_list = request.init_entries(entries.len() as u32); 54 | for (n, entry) in entries.iter().enumerate() { 55 | let mut slot = entry_list.borrow().get(n as u32); 56 | slot.set_term(entry.0.into()); 57 | slot.set_data(entry.1); 58 | } 59 | } 60 | Rc::new(message) 61 | } 62 | 63 | pub fn append_entries_response_success(term: Term, 64 | log_index: LogIndex) 65 | -> Rc> { 66 | let mut message = Builder::new_default(); 67 | { 68 | let mut response = message.init_root::() 69 | 
.init_append_entries_response(); 70 | response.set_term(term.as_u64()); 71 | response.set_success(log_index.as_u64()); 72 | } 73 | Rc::new(message) 74 | } 75 | 76 | pub fn append_entries_response_stale_term(term: Term) -> Rc> { 77 | let mut message = Builder::new_default(); 78 | { 79 | let mut response = message.init_root::() 80 | .init_append_entries_response(); 81 | response.set_term(term.as_u64()); 82 | response.set_stale_term(()); 83 | } 84 | Rc::new(message) 85 | } 86 | 87 | pub fn append_entries_response_inconsistent_prev_entry(term: Term, 88 | index: LogIndex) 89 | -> Rc> { 90 | let mut message = Builder::new_default(); 91 | { 92 | let mut response = message.init_root::() 93 | .init_append_entries_response(); 94 | response.set_term(term.as_u64()); 95 | response.set_inconsistent_prev_entry(index.into()); 96 | } 97 | Rc::new(message) 98 | } 99 | 100 | pub fn append_entries_response_internal_error(term: Term, 101 | error: &str) 102 | -> Rc> { 103 | let mut message = Builder::new_default(); 104 | { 105 | let mut response = message.init_root::() 106 | .init_append_entries_response(); 107 | response.set_term(term.as_u64()); 108 | response.set_internal_error(error); 109 | } 110 | Rc::new(message) 111 | } 112 | 113 | // RequestVote 114 | 115 | pub fn request_vote_request(term: Term, 116 | last_log_index: LogIndex, 117 | last_log_term: Term) 118 | -> Rc> { 119 | let mut message = Builder::new_default(); 120 | { 121 | let mut request = message.init_root::() 122 | .init_request_vote_request(); 123 | request.set_term(term.as_u64()); 124 | request.set_last_log_index(last_log_index.as_u64()); 125 | request.set_last_log_term(last_log_term.as_u64()); 126 | } 127 | Rc::new(message) 128 | } 129 | 130 | pub fn request_vote_response_granted(term: Term) -> Rc> { 131 | let mut message = Builder::new_default(); 132 | { 133 | let mut response = message.init_root::() 134 | .init_request_vote_response(); 135 | response.set_term(term.as_u64()); 136 | response.set_granted(()); 137 | } 138 | Rc::new(message) 139 | } 140 | 141 | pub fn request_vote_response_stale_term(term: Term) -> Rc> { 142 | let mut message = Builder::new_default(); 143 | { 144 | let mut response = message.init_root::() 145 | .init_request_vote_response(); 146 | response.set_term(term.as_u64()); 147 | response.set_stale_term(()); 148 | } 149 | Rc::new(message) 150 | } 151 | 152 | pub fn request_vote_response_already_voted(term: Term) -> Rc> { 153 | let mut message = Builder::new_default(); 154 | { 155 | let mut response = message.init_root::() 156 | .init_request_vote_response(); 157 | response.set_term(term.as_u64()); 158 | response.set_already_voted(()); 159 | } 160 | Rc::new(message) 161 | } 162 | 163 | pub fn request_vote_response_inconsistent_log(term: Term) -> Rc> { 164 | let mut message = Builder::new_default(); 165 | { 166 | let mut response = message.init_root::() 167 | .init_request_vote_response(); 168 | response.set_term(term.as_u64()); 169 | response.set_inconsistent_log(()); 170 | } 171 | Rc::new(message) 172 | } 173 | 174 | pub fn request_vote_response_internal_error(term: Term, error: &str) -> Rc> { 175 | let mut message = Builder::new_default(); 176 | { 177 | let mut response = message.init_root::() 178 | .init_request_vote_response(); 179 | response.set_term(term.as_u64()); 180 | response.set_internal_error(error); 181 | } 182 | Rc::new(message) 183 | } 184 | 185 | // Ping 186 | 187 | pub fn ping_request() -> Builder { 188 | let mut message = Builder::new_default(); 189 | { 190 | message.init_root::() 191 | .init_ping(); 192 
| } 193 | message 194 | } 195 | 196 | // Query 197 | 198 | pub fn query_request(entry: &[u8]) -> Builder { 199 | let mut message = Builder::new_default(); 200 | { 201 | message.init_root::() 202 | .init_query() 203 | .set_query(entry); 204 | } 205 | message 206 | } 207 | 208 | 209 | // Proposal 210 | 211 | pub fn proposal_request(entry: &[u8]) -> Builder { 212 | let mut message = Builder::new_default(); 213 | { 214 | message.init_root::() 215 | .init_proposal() 216 | .set_entry(entry); 217 | } 218 | message 219 | } 220 | 221 | // Query / Proposal Response 222 | 223 | pub fn command_response_success(data: &[u8]) -> Rc> { 224 | let mut message = Builder::new_default(); 225 | { 226 | message.init_root::() 227 | .init_proposal() 228 | .set_success(data); 229 | } 230 | Rc::new(message) 231 | } 232 | 233 | pub fn command_response_unknown_leader() -> Rc> { 234 | let mut message = Builder::new_default(); 235 | { 236 | message.init_root::() 237 | .init_proposal() 238 | .set_unknown_leader(()); 239 | } 240 | Rc::new(message) 241 | } 242 | 243 | pub fn command_response_not_leader(leader_hint: &SocketAddr) -> Rc> { 244 | let mut message = Builder::new_default(); 245 | { 246 | message.init_root::() 247 | .init_proposal() 248 | .set_not_leader(&format!("{}", leader_hint)); 249 | } 250 | Rc::new(message) 251 | } 252 | -------------------------------------------------------------------------------- /src/persistent_log/fs.rs: -------------------------------------------------------------------------------- 1 | use std::{error, fmt, fs, path, result}; 2 | use std::io::prelude::*; 3 | use std::io::{BufReader, BufWriter, SeekFrom}; 4 | 5 | use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; 6 | use persistent_log::Log; 7 | use LogIndex; 8 | use ServerId; 9 | use Term; 10 | 11 | /// This is a `Log` implementation that stores entries in the filesystem 12 | /// as well as in a struct. It is chiefly intended for testing. 13 | /// 14 | /// # Panic 15 | /// 16 | /// No bounds checking is performed and attempted access to non-existing log 17 | /// indexes will panic. 18 | 19 | 20 | /// Error type for FsLog 21 | 22 | #[derive(Debug, PartialEq, Eq)] 23 | pub struct Error; 24 | 25 | impl fmt::Display for Error { 26 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 27 | write!(fmt, "An error occurred") 28 | } 29 | } 30 | 31 | impl error::Error for Error { 32 | fn description(&self) -> &str { 33 | "An error occurred" 34 | } 35 | } 36 | 37 | impl ::std::convert::From<::std::io::Error> for Error { 38 | fn from(_err: ::std::io::Error) -> Error { 39 | Error 40 | } 41 | } 42 | 43 | pub type Result = result::Result; 44 | pub type Entry = (Term, Vec); 45 | 46 | /// Version of the log file format. A logfile will always start with an eight 47 | /// byte version specifier. If the format ever changes, this version will be 48 | /// updated, so FsLog will not read the log incorrectly. 49 | const VERSION: u64 = 1; 50 | 51 | /// Stores log on disk as 8 bytes for the version identifier, 8 bytes for 52 | /// current_term, 8 bytes for voted_for, and as much as needed for the log. 53 | /// Each log entry is stored as an 8 byte length specifier which is the total 54 | /// length of the entry in bytes, including the length specifier, followed by 8 55 | /// bytes specifying the term, plus a variable length entry, which is the 56 | /// serialized command sent to raft by the client. 
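///
/// Sketched out, the resulting on-disk layout is:
///
/// ```text
/// offset  0: u64  VERSION (currently 1)
/// offset  8: u64  current_term
/// offset 16: u64  voted_for (u64::max_value() encodes `None`)
/// offset 24: entries, each written as
///            [u64 total_len][u64 term][total_len - 16 bytes of command]
/// ```
///
/// This is why the restore test at the bottom of this file sees entry offsets of
/// `[24, 41, 58, 75]`: each one-byte command occupies 17 bytes on disk.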
57 | #[derive(Debug)] 58 | pub struct FsLog { 59 | reader: BufReader, 60 | writer: BufWriter, 61 | current_term: Term, 62 | voted_for: Option, 63 | entries: Vec<(Term, Vec)>, 64 | offsets: Vec, 65 | } 66 | 67 | 68 | impl FsLog { 69 | pub fn new(filename: &path::Path) -> Result { 70 | 71 | let mut w = BufWriter::new( 72 | fs::OpenOptions::new() 73 | .create(true) 74 | .write(true) 75 | .open(&filename)?); 76 | 77 | let filelen = w.get_ref().metadata()?.len(); 78 | 79 | if filelen == 0 { 80 | w.write_u64::(VERSION)?; // Term (0) 81 | w.write_u64::(0)?; // Term (0) 82 | w.write_u64::(::max_value())?; // Voted for (None) 83 | w.flush()?; 84 | } 85 | 86 | let mut r = BufReader::new(fs::File::open(&filename)?); 87 | 88 | let version = r.read_u64::()?; 89 | if version != VERSION { 90 | return Err(Error); 91 | } 92 | let current_term: Term = r.read_u64::()?.into(); 93 | let voted_for: Option = match r.read_u64::()? { 94 | x if x == ::max_value() => None, 95 | x => Some(x.into()) 96 | }; 97 | 98 | let mut log = FsLog { 99 | reader: r, 100 | writer: w, 101 | current_term: current_term, 102 | voted_for: voted_for, 103 | entries: Vec::new(), 104 | offsets: Vec::new(), 105 | }; 106 | 107 | let mut offset = 24; // The size of the header. 108 | while offset < filelen { 109 | log.offsets.push(offset); 110 | let entry = log.read_entry(None)?; 111 | log.entries.push(entry); 112 | offset = log.reader.seek(SeekFrom::Current(0))?; 113 | } 114 | Ok(log) 115 | } 116 | 117 | fn write_term(&mut self) -> Result<()> { 118 | self.writer.seek(SeekFrom::Start(8))?; 119 | self.writer.write_u64::(self.current_term.into())?; 120 | // Set voted_for to None 121 | self.writer.write_u64::(::max_value())?; 122 | self.writer.flush()?; 123 | Ok(()) 124 | } 125 | 126 | fn write_voted_for(&mut self) -> Result<()> { 127 | self.writer.seek(SeekFrom::Start(16))?; 128 | self.writer.write_u64::( 129 | match self.voted_for { 130 | None => ::max_value(), 131 | Some(ServerId(n)) => n, 132 | } 133 | )?; 134 | self.writer.flush()?; 135 | Ok(()) 136 | } 137 | 138 | fn read_entry(&mut self, index: Option) -> Result { 139 | // Could be more efficient about not copying data here. 140 | if let Some(index) = index { 141 | let offset = self.offsets.get(index).ok_or(Error)?; 142 | self.reader.seek(SeekFrom::Start(*offset))?; 143 | } 144 | let length = self.reader.read_u64::()? 
as usize; 145 | let term = self.reader.read_u64::()?.into(); 146 | let mut command = vec![0u8; length - 16]; 147 | self.reader.read_exact(&mut command[..length - 16])?; 148 | Ok((term, command)) 149 | } 150 | 151 | fn truncate_file(&mut self, index: usize) -> Result<()> { 152 | match self.offsets.get(index) { 153 | None => {}, 154 | Some(offset) => self.writer.get_mut().set_len(*offset)?, 155 | }; 156 | self.reader.seek(SeekFrom::End(0))?; // Clear the buffer 157 | self.writer.seek(SeekFrom::End(0))?; // Clear the buffer 158 | Ok(()) 159 | } 160 | 161 | ///Add an entry to the log at the current location 162 | fn write_entry(&mut self, index: usize, term: Term, command: &[u8]) -> Result<()> { 163 | if index > self.entries.len() { 164 | Err(Error) 165 | } else { 166 | let new_offset = self.reader.seek(SeekFrom::End(0))?; 167 | self.offsets.push(new_offset); 168 | let entry_len = (command.len() + 16) as u64; 169 | self.writer.write_u64::(entry_len)?; 170 | self.writer.write_u64::(term.into())?; 171 | self.writer.write_all(&command[..])?; 172 | Ok(()) 173 | } 174 | } 175 | 176 | fn rewrite_entries(&mut self, from: LogIndex, entries: &[(Term, &[u8])]) -> Result<()> { 177 | assert!(self.latest_log_index()? + 1 >= from); 178 | let mut index = (from - 1).as_u64() as usize; 179 | self.truncate_file(index)?; 180 | self.entries.truncate(index); 181 | self.offsets.truncate(index); 182 | self.entries.extend(entries.iter().map(|&(term, command)| (term, command.to_vec()))); 183 | for &(term, command) in entries { 184 | self.write_entry(index, term, command)?; 185 | index += 1; 186 | } 187 | self.writer.flush()?; 188 | Ok(()) 189 | } 190 | } 191 | 192 | 193 | impl Log for FsLog { 194 | type Error = Error; 195 | 196 | fn current_term(&self) -> Result { 197 | Ok(self.current_term) 198 | } 199 | 200 | fn set_current_term(&mut self, term: Term) -> Result<()> { 201 | self.current_term = term; 202 | self.voted_for = None; 203 | self.write_term()?; 204 | Ok(()) 205 | } 206 | 207 | fn inc_current_term(&mut self) -> Result { 208 | self.current_term = self.current_term + 1; 209 | self.voted_for = None; 210 | self.write_term()?; 211 | self.current_term() 212 | } 213 | 214 | fn voted_for(&self) -> Result> { 215 | Ok(self.voted_for) 216 | } 217 | 218 | fn set_voted_for(&mut self, address: ServerId) -> Result<()> { 219 | self.voted_for = Some(address); 220 | self.write_voted_for()?; 221 | Ok(()) 222 | } 223 | 224 | fn latest_log_index(&self) -> Result { 225 | Ok(LogIndex(self.entries.len() as u64)) 226 | } 227 | 228 | fn latest_log_term(&self) -> Result { 229 | let len = self.entries.len(); 230 | if len == 0 { 231 | Ok(Term::from(0)) 232 | } else { 233 | Ok(self.entries[len - 1].0) 234 | } 235 | } 236 | 237 | fn entry(&self, index: LogIndex) -> Result<(Term, &[u8])> { 238 | let (term, ref bytes) = self.entries[(index - 1).as_u64() as usize]; 239 | Ok((term, bytes)) 240 | } 241 | 242 | /// Append entries sent from the leader. 243 | fn append_entries(&mut self, 244 | from: LogIndex, 245 | entries: &[(Term, &[u8])]) 246 | -> Result<()> { 247 | assert!(self.latest_log_index()? 
+ 1 >= from); 248 | let from_idx = (from - 1).as_u64() as usize; 249 | for idx in 0..entries.len() { 250 | match self.entries.get(from_idx + idx).map(|entry| entry.0) { 251 | Some(term) => { 252 | let sent_term = entries[idx].0; 253 | if term == sent_term { 254 | continue; 255 | } else { 256 | self.rewrite_entries(from + idx as u64, &entries[idx..])?; 257 | break; 258 | } 259 | }, 260 | None => { 261 | self.rewrite_entries(from + idx as u64, &entries[idx..])?; 262 | break; 263 | } 264 | }; 265 | } 266 | Ok(()) 267 | } 268 | } 269 | 270 | 271 | impl Clone for FsLog { 272 | fn clone(&self) -> FsLog { 273 | // Wish I didn't have to unwrap the filehandles... 274 | FsLog { 275 | reader: BufReader::new(self.reader.get_ref().try_clone().expect("cloning self.reader")), 276 | writer: BufWriter::new(self.writer.get_ref().try_clone().expect("cloning self.writer")), 277 | current_term: self.current_term, 278 | voted_for: self.voted_for, 279 | entries: self.entries.clone(), 280 | offsets: self.offsets.clone(), 281 | } 282 | } 283 | } 284 | 285 | 286 | #[cfg(test)] 287 | mod test { 288 | use std::fs::remove_file; 289 | use std::path::Path; 290 | use super::*; 291 | use LogIndex; 292 | use ServerId; 293 | use Term; 294 | use persistent_log::Log; 295 | 296 | fn assert_entries_equal(store: &FsLog, expected: Vec<(Term, &[u8])>) { 297 | assert_eq!(LogIndex::from(expected.len() as u64), store.latest_log_index().unwrap()); 298 | assert_eq!(expected[expected.len() - 1].0, store.latest_log_term().unwrap()); 299 | for i in 0..expected.len() { 300 | assert_eq!(store.entry(LogIndex::from((i + 1) as u64)).unwrap(), expected[i]); 301 | } 302 | } 303 | 304 | #[test] 305 | fn test_current_term() { 306 | let filename = Path::new("/tmp/raft-store.1"); 307 | remove_file(&filename).unwrap_or(()); 308 | let mut store = FsLog::new(&filename).unwrap(); 309 | assert_eq!(Term(0), store.current_term().unwrap()); 310 | store.set_voted_for(ServerId::from(0)).unwrap(); 311 | store.set_current_term(Term(42)).unwrap(); 312 | assert_eq!(None, store.voted_for().unwrap()); 313 | assert_eq!(Term(42), store.current_term().unwrap()); 314 | store.inc_current_term().unwrap(); 315 | assert_eq!(Term(43), store.current_term().unwrap()); 316 | remove_file(&filename).unwrap(); 317 | } 318 | 319 | #[test] 320 | fn test_voted_for() { 321 | let filename = Path::new("/tmp/raft-store.2"); 322 | remove_file(&filename).unwrap_or(()); 323 | let mut store = FsLog::new(&filename).unwrap(); 324 | assert_eq!(None, store.voted_for().unwrap()); 325 | let id = ServerId::from(0); 326 | store.set_voted_for(id).unwrap(); 327 | assert_eq!(Some(id), store.voted_for().unwrap()); 328 | remove_file(&filename).unwrap(); 329 | } 330 | 331 | #[test] 332 | fn test_append_entries() { 333 | let filename = Path::new("/tmp/raft-store.3"); 334 | remove_file(&filename).unwrap_or(()); 335 | let mut store = FsLog::new(&filename).unwrap(); 336 | assert_eq!(LogIndex::from(0), store.latest_log_index().unwrap()); 337 | assert_eq!(Term::from(0), store.latest_log_term().unwrap()); 338 | 339 | // [0.1, 0.2, 0.3, 1.4] Initial log 340 | store.append_entries(LogIndex(1), 341 | &[(Term::from(0), &[1]), 342 | (Term::from(0), &[2]), 343 | (Term::from(0), &[3]), 344 | (Term::from(1), &[4])]) 345 | .unwrap(); 346 | assert_entries_equal(&store, vec![(Term::from(0), &*vec![1]), 347 | (Term::from(0), &*vec![2]), 348 | (Term::from(0), &*vec![3]), 349 | (Term::from(1), &*vec![4])]); 350 | 351 | // [0.1, 0.2, 0.3, 1.4] Empty log, no modification 352 | store.append_entries(LogIndex::from(3), 
&[]).unwrap(); 353 | assert_entries_equal(&store, vec![(Term::from(0), &*vec![1]), 354 | (Term::from(0), &*vec![2]), 355 | (Term::from(0), &*vec![3]), 356 | (Term::from(1), &*vec![4])]); 357 | 358 | // [0.1, 0.2, 0.3, 1.4] All match, non-exhaustive 359 | store.append_entries(LogIndex::from(2), 360 | &[(Term::from(0), &[2]), 361 | (Term::from(0), &[3])]) 362 | .unwrap(); 363 | assert_entries_equal(&store, vec![(Term::from(0), &[1u8]), 364 | (Term::from(0), &[2u8]), 365 | (Term::from(0), &[3u8]), 366 | (Term::from(1), &[4u8])]); 367 | 368 | // [0.1, 0.2, 2.5, 2.6] One match, two new 369 | store.append_entries(LogIndex::from(2), 370 | &[(Term::from(0), &[2]), 371 | (Term::from(2), &[5]), 372 | (Term::from(2), &[6])]) 373 | .unwrap(); 374 | assert_entries_equal(&store, vec![(Term::from(0), &*vec![1]), 375 | (Term::from(0), &*vec![2u8]), 376 | (Term::from(2), &*vec![5u8]), 377 | (Term::from(2), &*vec![6u8])]); 378 | 379 | // [0.1, 0.2, 4.7, 5.8] All new entries 380 | store.append_entries(LogIndex::from(3), &[(Term(4), &[7]), (Term(5), &[8])]).unwrap(); 381 | assert_entries_equal(&store, vec![(Term::from(0), &*vec![1]), 382 | (Term::from(0), &*vec![2]), 383 | (Term::from(4), &*vec![7]), 384 | (Term::from(5), &*vec![8])]); 385 | remove_file(&filename).unwrap(); 386 | } 387 | 388 | #[test] 389 | fn test_restore_log() { 390 | let filename = Path::new("/tmp/raft-store.4"); 391 | remove_file(&filename).unwrap_or(()); 392 | { 393 | let mut store = FsLog::new(&filename).unwrap(); 394 | store.set_current_term(Term(42)).unwrap(); 395 | store.set_voted_for(ServerId::from(4)).unwrap(); 396 | store.append_entries(LogIndex(1), 397 | &[(Term::from(0), &[1]), 398 | (Term::from(0), &[2]), 399 | (Term::from(0), &[3]), 400 | (Term::from(1), &[4])]) 401 | .unwrap(); 402 | } 403 | 404 | // New store with the same backing file starts with the same state. 405 | let store = FsLog::new(&filename).unwrap(); 406 | assert_eq!(store.voted_for().unwrap(), Some(ServerId::from(4))); 407 | assert_eq!(store.current_term().unwrap(), Term(42)); 408 | assert_entries_equal(&store, vec![(Term::from(0), &[1]), 409 | (Term::from(0), &[2]), 410 | (Term::from(0), &[3]), 411 | (Term::from(1), &[4])]); 412 | assert_eq!(store.offsets, [24, 41, 58, 75]); 413 | remove_file(&filename).unwrap(); 414 | } 415 | } 416 | -------------------------------------------------------------------------------- /src/persistent_log/mem.rs: -------------------------------------------------------------------------------- 1 | use std::{error, fmt, result}; 2 | 3 | use persistent_log::Log; 4 | use LogIndex; 5 | use ServerId; 6 | use Term; 7 | 8 | /// This is a `Log` implementation that stores entries in a simple in-memory vector. Other data 9 | /// is stored in a struct. It is chiefly intended for testing. 10 | /// 11 | /// # Panic 12 | /// 13 | /// No bounds checking is performed and attempted access to non-existing log 14 | /// indexes will panic. 
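///
/// A minimal usage sketch, mirroring the tests at the bottom of this file:
///
/// ```
/// use raft::persistent_log::{Log, MemLog};
/// use raft::{LogIndex, Term};
///
/// let mut log = MemLog::new();
/// log.append_entries(LogIndex::from(1), &[(Term::from(0), &[1, 2, 3])]).unwrap();
/// assert_eq!(LogIndex::from(1), log.latest_log_index().unwrap());
/// assert_eq!((Term::from(0), &[1u8, 2, 3][..]), log.entry(LogIndex::from(1)).unwrap());
/// ```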
15 | #[derive(Clone, Debug)] 16 | pub struct MemLog { 17 | current_term: Term, 18 | voted_for: Option, 19 | entries: Vec<(Term, Vec)>, 20 | } 21 | 22 | /// Non-instantiable error type for MemLog 23 | pub enum Error { } 24 | 25 | impl fmt::Display for Error { 26 | fn fmt(&self, _fmt: &mut fmt::Formatter) -> fmt::Result { 27 | unreachable!() 28 | } 29 | } 30 | 31 | impl fmt::Debug for Error { 32 | fn fmt(&self, _fmt: &mut fmt::Formatter) -> fmt::Result { 33 | unreachable!() 34 | } 35 | } 36 | 37 | impl error::Error for Error { 38 | fn description(&self) -> &str { 39 | unreachable!() 40 | } 41 | } 42 | 43 | impl MemLog { 44 | pub fn new() -> MemLog { 45 | MemLog { 46 | current_term: Term(0), 47 | voted_for: None, 48 | entries: Vec::new(), 49 | } 50 | } 51 | } 52 | 53 | impl Log for MemLog { 54 | type Error = Error; 55 | 56 | fn current_term(&self) -> result::Result { 57 | Ok(self.current_term) 58 | } 59 | 60 | fn set_current_term(&mut self, term: Term) -> result::Result<(), Error> { 61 | self.voted_for = None; 62 | Ok(self.current_term = term) 63 | } 64 | 65 | fn inc_current_term(&mut self) -> result::Result { 66 | self.voted_for = None; 67 | self.current_term = self.current_term + 1; 68 | self.current_term() 69 | } 70 | 71 | fn voted_for(&self) -> result::Result, Error> { 72 | Ok(self.voted_for) 73 | } 74 | 75 | fn set_voted_for(&mut self, address: ServerId) -> result::Result<(), Error> { 76 | Ok(self.voted_for = Some(address)) 77 | } 78 | 79 | fn latest_log_index(&self) -> result::Result { 80 | Ok(LogIndex(self.entries.len() as u64)) 81 | } 82 | 83 | fn latest_log_term(&self) -> result::Result { 84 | let len = self.entries.len(); 85 | if len == 0 { 86 | Ok(Term::from(0)) 87 | } else { 88 | Ok(self.entries[len - 1].0) 89 | } 90 | } 91 | 92 | fn entry(&self, index: LogIndex) -> result::Result<(Term, &[u8]), Error> { 93 | let (term, ref bytes) = self.entries[(index - 1).as_u64() as usize]; 94 | Ok((term, bytes)) 95 | } 96 | 97 | fn append_entries(&mut self, 98 | from: LogIndex, 99 | entries: &[(Term, &[u8])]) 100 | -> result::Result<(), Error> { 101 | assert!(self.latest_log_index().unwrap() + 1 >= from); 102 | self.entries.truncate((from - 1).as_u64() as usize); 103 | Ok(self.entries.extend(entries.iter().map(|&(term, command)| (term, command.to_vec())))) 104 | } 105 | } 106 | 107 | #[cfg(test)] 108 | mod test { 109 | 110 | use super::*; 111 | use LogIndex; 112 | use ServerId; 113 | use Term; 114 | use persistent_log::Log; 115 | 116 | #[test] 117 | fn test_current_term() { 118 | let mut store = MemLog::new(); 119 | assert_eq!(Term(0), store.current_term().unwrap()); 120 | store.set_voted_for(ServerId::from(0)).unwrap(); 121 | store.set_current_term(Term(42)).unwrap(); 122 | assert_eq!(None, store.voted_for().unwrap()); 123 | assert_eq!(Term(42), store.current_term().unwrap()); 124 | store.inc_current_term().unwrap(); 125 | assert_eq!(Term(43), store.current_term().unwrap()); 126 | } 127 | 128 | #[test] 129 | fn test_voted_for() { 130 | let mut store = MemLog::new(); 131 | assert_eq!(None, store.voted_for().unwrap()); 132 | let id = ServerId::from(0); 133 | store.set_voted_for(id).unwrap(); 134 | assert_eq!(Some(id), store.voted_for().unwrap()); 135 | } 136 | 137 | #[test] 138 | fn test_append_entries() { 139 | let mut store = MemLog::new(); 140 | assert_eq!(LogIndex::from(0), store.latest_log_index().unwrap()); 141 | assert_eq!(Term::from(0), store.latest_log_term().unwrap()); 142 | 143 | // [0.1, 0.2, 0.3, 1.4] 144 | store.append_entries(LogIndex(1), 145 | &[(Term::from(0), &[1]), 146 | 
(Term::from(0), &[2]), 147 | (Term::from(0), &[3]), 148 | (Term::from(1), &[4])]) 149 | .unwrap(); 150 | assert_eq!(LogIndex::from(4), store.latest_log_index().unwrap()); 151 | assert_eq!(Term::from(1), store.latest_log_term().unwrap()); 152 | assert_eq!((Term::from(0), &*vec![1u8]), 153 | store.entry(LogIndex::from(1)).unwrap()); 154 | assert_eq!((Term::from(0), &*vec![2u8]), 155 | store.entry(LogIndex::from(2)).unwrap()); 156 | assert_eq!((Term::from(0), &*vec![3u8]), 157 | store.entry(LogIndex::from(3)).unwrap()); 158 | assert_eq!((Term::from(1), &*vec![4u8]), 159 | store.entry(LogIndex::from(4)).unwrap()); 160 | 161 | // [0.1, 0.2, 0.3] 162 | store.append_entries(LogIndex::from(4), &[]).unwrap(); 163 | assert_eq!(LogIndex(3), store.latest_log_index().unwrap()); 164 | assert_eq!(Term::from(0), store.latest_log_term().unwrap()); 165 | assert_eq!((Term::from(0), &*vec![1u8]), 166 | store.entry(LogIndex::from(1)).unwrap()); 167 | assert_eq!((Term::from(0), &*vec![2u8]), 168 | store.entry(LogIndex::from(2)).unwrap()); 169 | assert_eq!((Term::from(0), &*vec![3u8]), 170 | store.entry(LogIndex::from(3)).unwrap()); 171 | 172 | // [0.1, 0.2, 2.3, 3.4] 173 | store.append_entries(LogIndex::from(3), &[(Term(2), &[3]), (Term(3), &[4])]).unwrap(); 174 | assert_eq!(LogIndex(4), store.latest_log_index().unwrap()); 175 | assert_eq!(Term::from(3), store.latest_log_term().unwrap()); 176 | assert_eq!((Term::from(0), &*vec![1u8]), 177 | store.entry(LogIndex::from(1)).unwrap()); 178 | assert_eq!((Term::from(0), &*vec![2u8]), 179 | store.entry(LogIndex::from(2)).unwrap()); 180 | assert_eq!((Term::from(2), &*vec![3u8]), 181 | store.entry(LogIndex::from(3)).unwrap()); 182 | assert_eq!((Term::from(3), &*vec![4u8]), 183 | store.entry(LogIndex::from(4)).unwrap()); 184 | } 185 | } 186 | 187 | 188 | impl Default for MemLog { 189 | fn default() -> Self { 190 | MemLog::new() 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/persistent_log/mod.rs: -------------------------------------------------------------------------------- 1 | //! The persistent storage of Raft state. 2 | //! 3 | //! In your consuming application you will want to implement this trait on one of your structures. 4 | //! This could adapt to a database, a file, or even just POD. 5 | //! 6 | //! *Note:* Your consuming application should not necessarily interface with this data. It is meant 7 | //! for internal use by the library, we simply chose not to be opinionated about how data is stored. 8 | 9 | mod fs; 10 | mod mem; 11 | 12 | use std::error; 13 | use std::fmt::Debug; 14 | use std::result; 15 | 16 | pub use persistent_log::fs::FsLog; 17 | pub use persistent_log::mem::{MemLog, Error}; 18 | 19 | use LogIndex; 20 | use Term; 21 | use ServerId; 22 | 23 | /// A store of persistent Raft state. 24 | pub trait Log: Clone + Debug + Send + 'static { 25 | type Error: error::Error + Debug + Sized + 'static; 26 | 27 | /// Returns the latest known term. 28 | fn current_term(&self) -> result::Result; 29 | 30 | /// Sets the current term to the provided value. The provided term must be greater than 31 | /// the current term. The `voted_for` value will be reset`. 32 | fn set_current_term(&mut self, term: Term) -> result::Result<(), Self::Error>; 33 | 34 | /// Increment the current term. The `voted_for` value will be reset. 35 | fn inc_current_term(&mut self) -> result::Result; 36 | 37 | /// Returns the candidate id of the candidate voted for in the current term (or none). 
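    ///
    /// Implementations reset this to `None` whenever the term changes; see
    /// `set_current_term` and `inc_current_term` above.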
38 | fn voted_for(&self) -> result::Result, Self::Error>; 39 | 40 | /// Sets the candidate id voted for in the current term. 41 | fn set_voted_for(&mut self, server: ServerId) -> result::Result<(), Self::Error>; 42 | 43 | /// Returns the index of the latest persisted log entry (0 if the log is empty). 44 | fn latest_log_index(&self) -> result::Result; 45 | 46 | /// Returns the term of the latest persisted log entry (0 if the log is empty). 47 | fn latest_log_term(&self) -> result::Result; 48 | 49 | /// Returns the entry at the provided log index. 50 | fn entry(&self, index: LogIndex) -> result::Result<(Term, &[u8]), Self::Error>; 51 | 52 | /// Returns the given range of entries (excluding the right endpoint). 53 | fn entries(&self, 54 | lo: LogIndex, 55 | hi: LogIndex) 56 | -> result::Result, Self::Error> { 57 | // TODO: can make LogIndex compatible for use in ranges. 58 | (lo.as_u64()..hi.as_u64()) 59 | .map(|index| self.entry(LogIndex::from(index))) 60 | .collect::>() 61 | } 62 | 63 | 64 | /// Appends the provided entries to the log beginning at the given index. 65 | fn append_entries(&mut self, 66 | from: LogIndex, 67 | entries: &[(Term, &[u8])]) 68 | -> result::Result<(), Self::Error>; 69 | } 70 | -------------------------------------------------------------------------------- /src/server.rs: -------------------------------------------------------------------------------- 1 | //! `Server` is a Rust type which is responsible for coordinating with other remote `Server` 2 | //! instances, responding to commands from the `Client`, and applying commands to a local 3 | //! `StateMachine` consensus. A `Server` may be a `Leader`, `Follower`, or `Candidate` at any given 4 | //! time as described by the Raft Consensus Algorithm. 5 | 6 | use std::{fmt, io}; 7 | use std::str::FromStr; 8 | use std::collections::HashMap; 9 | use std::net::SocketAddr; 10 | use std::thread::{self, JoinHandle}; 11 | use std::rc::Rc; 12 | 13 | use mio::tcp::TcpListener; 14 | use mio::util::Slab; 15 | use mio::{EventLoop, EventSet, Handler, PollOpt, Token}; 16 | use mio::Timeout as TimeoutHandle; 17 | use capnp::message::{Builder, HeapAllocator}; 18 | 19 | use ClientId; 20 | use Result; 21 | use Error; 22 | use RaftError; 23 | use ServerId; 24 | use messages; 25 | use messages_capnp::connection_preamble; 26 | use consensus::{Consensus, Actions, ConsensusTimeout, TimeoutConfiguration}; 27 | use state_machine::StateMachine; 28 | use persistent_log::Log; 29 | use connection::{Connection, ConnectionKind}; 30 | 31 | const LISTENER: Token = Token(0); 32 | 33 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] 34 | 35 | pub enum ServerTimeout { 36 | Consensus(ConsensusTimeout), 37 | Reconnect(Token), 38 | } 39 | 40 | pub struct ServerBuilder 41 | where 42 | L: Log, 43 | M: StateMachine, 44 | { 45 | id: ServerId, 46 | addr: SocketAddr, 47 | peers: Option>, 48 | store: L, 49 | state_machine: M, 50 | max_connections: usize, 51 | election_min_millis: u64, 52 | election_max_millis: u64, 53 | heartbeat_millis: u64, 54 | } 55 | 56 | impl ServerBuilder 57 | where 58 | L: Log, 59 | M: StateMachine, 60 | { 61 | fn new(id: ServerId, addr: SocketAddr, store: L, state_machine: M) -> ServerBuilder { 62 | /// Create a ServerBuilder with default values 63 | /// for optional members. 
64 | ServerBuilder { 65 | id: id, 66 | addr: addr, 67 | peers: None, 68 | store: store, 69 | state_machine: state_machine, 70 | max_connections: 128, 71 | election_min_millis: 150, 72 | election_max_millis: 350, 73 | heartbeat_millis: 60, 74 | } 75 | } 76 | 77 | pub fn finalize(self) -> Result> { 78 | Server::finalize( 79 | self.id, 80 | self.addr, 81 | self.peers.unwrap_or_else(HashMap::new), 82 | self.store, 83 | self.state_machine, 84 | self.election_min_millis, 85 | self.election_max_millis, 86 | self.heartbeat_millis, 87 | self.max_connections, 88 | ) 89 | } 90 | 91 | pub fn run(self) -> Result<()> { 92 | let mut server = self.finalize()?; 93 | server.run() 94 | } 95 | 96 | pub fn with_max_connections(mut self, count: usize) -> ServerBuilder { 97 | self.max_connections = count; 98 | self 99 | } 100 | 101 | pub fn with_election_min_millis(mut self, timeout: u64) -> ServerBuilder { 102 | self.election_min_millis = timeout; 103 | self 104 | } 105 | 106 | pub fn with_election_max_millis(mut self, timeout: u64) -> ServerBuilder { 107 | self.election_max_millis = timeout; 108 | self 109 | } 110 | 111 | pub fn with_heartbeat_millis(mut self, timeout: u64) -> ServerBuilder { 112 | self.heartbeat_millis = timeout; 113 | self 114 | } 115 | 116 | pub fn with_peers(mut self, peers: HashMap) -> ServerBuilder { 117 | self.peers = Some(peers); 118 | self 119 | } 120 | } 121 | 122 | /// The `Server` is responsible for receiving events from peer `Server` instance or clients, 123 | /// as well as managing election and heartbeat timeouts. When an event is received, it is applied 124 | /// to the local `Consensus`. The `Consensus` may optionally return a set of events to be 125 | /// dispatched to either remote peers or clients. 126 | /// 127 | /// ## Logging 128 | /// 129 | /// Server instances log events according to frequency and importance. It is recommended to use at 130 | /// least info level logging when running in production. The warn level is used for unexpected, 131 | /// but recoverable events. The info level is used for infrequent events such as connection resets 132 | /// and election results. The debug level is used for frequent events such as client proposals and 133 | /// heartbeats. The trace level is used for very high frequency debugging output. 134 | pub struct Server 135 | where L: Log, 136 | M: StateMachine 137 | { 138 | /// Id of this server. 139 | id: ServerId, 140 | 141 | /// Raft state machine consensus. 142 | consensus: Consensus, 143 | 144 | /// Connection listener. 145 | listener: TcpListener, 146 | 147 | /// Collection of connections indexed by token. 148 | connections: Slab, 149 | 150 | /// Index of peer id to connection token. 151 | peer_tokens: HashMap, 152 | 153 | /// Index of client id to connection token. 154 | client_tokens: HashMap, 155 | 156 | /// Currently registered consensus timeouts. 157 | consensus_timeouts: HashMap, 158 | 159 | /// Currently registered reconnection timeouts. 160 | reconnection_timeouts: HashMap, 161 | 162 | /// Configured timeouts 163 | timeout_config: TimeoutConfiguration, 164 | } 165 | 166 | /// The implementation of the Server. 167 | impl Server 168 | where L: Log, 169 | M: StateMachine 170 | { 171 | #[cfg_attr(feature = "cargo-clippy", allow(new_ret_no_self))] 172 | pub fn new( 173 | id: ServerId, 174 | addr: SocketAddr, 175 | store: L, 176 | state_machine: M,) -> ServerBuilder { 177 | ServerBuilder::new(id, addr, store, state_machine) 178 | } 179 | 180 | /// Creates a new instance of the server. 
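    /// Invoked by `ServerBuilder::finalize` with the builder's settings.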
181 | /// *Gotcha:* `peers` must not contain the local `id`. 182 | #[cfg_attr(feature = "cargo-clippy", allow(too_many_arguments))] 183 | fn finalize( 184 | id: ServerId, 185 | addr: SocketAddr, 186 | peers: HashMap, 187 | store: L, 188 | state_machine: M, 189 | election_min_millis: u64, 190 | election_max_millis: u64, 191 | heartbeat_millis: u64, 192 | max_connections: usize) 193 | -> Result> { 194 | if peers.contains_key(&id) { 195 | return Err(Error::Raft(RaftError::InvalidPeerSet)); 196 | } 197 | 198 | let timeout_config = TimeoutConfiguration { 199 | election_min_ms: election_min_millis, 200 | election_max_ms: election_max_millis, 201 | heartbeat_ms: heartbeat_millis, 202 | }; 203 | let consensus = Consensus::new(id, peers.clone(), store, state_machine); 204 | let listener = try!(TcpListener::bind(&addr)); 205 | 206 | let mut server = Server { 207 | id: id, 208 | consensus: consensus, 209 | listener: listener, 210 | connections: Slab::new_starting_at(Token(1), max_connections), 211 | peer_tokens: HashMap::new(), 212 | client_tokens: HashMap::new(), 213 | consensus_timeouts: HashMap::new(), 214 | reconnection_timeouts: HashMap::new(), 215 | timeout_config: timeout_config, 216 | }; 217 | 218 | for (peer_id, peer_addr) in peers { 219 | let token: Token = try!(server.connections 220 | .insert(try!(Connection::peer(peer_id, peer_addr))) 221 | .map_err(|_| { 222 | Error::Raft(RaftError::ConnectionLimitReached) 223 | })); 224 | scoped_assert!(server.peer_tokens.insert(peer_id, token).is_none()); 225 | } 226 | Ok(server) 227 | } 228 | 229 | fn start_loop(&mut self) -> Result>> 230 | where 231 | L: Log, 232 | M: StateMachine 233 | { 234 | let mut event_loop = try!(EventLoop::>::new()); 235 | try!(event_loop.register(&self.listener, LISTENER, EventSet::all(), PollOpt::level())); 236 | let mut tokens = vec![]; 237 | for token in self.peer_tokens.values() { 238 | tokens.push(*token); 239 | } 240 | let id = self.id; 241 | let addr = self.listener.local_addr()?; 242 | for token in tokens { 243 | try!(self.connections[token].register(&mut event_loop, token)); 244 | self.send_message(&mut event_loop, 245 | token, 246 | messages::server_connection_preamble(id, &addr)); 247 | } 248 | Ok(event_loop) 249 | } 250 | /// Runs a new Raft server in the current thread. 251 | /// 252 | /// # Arguments 253 | /// 254 | /// * `id` - The ID of the new node. 255 | /// * `addr` - The address of the new node. 256 | /// * `peers` - The ID and address of all peers in the Raft cluster. 257 | /// * `store` - The persistent log store. 258 | /// * `state_machine` - The client state machine to which client commands will be applied. 259 | pub fn run(&mut self) -> Result<()> { 260 | let mut event_loop = try!(self.start_loop()); 261 | let actions = self.consensus.init(); 262 | self.execute_actions(&mut event_loop, actions); 263 | event_loop.run(self).map_err(From::from) 264 | } 265 | 266 | /// Spawns a new Raft server in a background thread. 267 | /// 268 | /// # Arguments 269 | /// 270 | /// * `id` - The ID of the new node. 271 | /// * `addr` - The address of the new node. 272 | /// * `peers` - The ID and address of all peers in the Raft cluster. 273 | /// * `store` - The persistent log store. 274 | /// * `state_machine` - The client state machine to which client commands will be applied. 
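///
/// # Example
///
/// A minimal sketch of spawning a single-node cluster, assuming `MemLog` and
/// `NullStateMachine` are publicly reachable at the paths shown (they are used
/// by those names in this crate's tests):
///
/// ```ignore
/// use std::collections::HashMap;
/// use std::net::SocketAddr;
/// use std::str::FromStr;
///
/// use raft::{Server, ServerId};
/// use raft::persistent_log::MemLog;
/// use raft::state_machine::NullStateMachine;
///
/// let addr = SocketAddr::from_str("127.0.0.1:0").unwrap();
/// // Spawn the server in a background thread; the join handle yields the
/// // result of the event loop when it shuts down.
/// let handle = Server::spawn(ServerId::from(0),
///                            addr,
///                            HashMap::new(),
///                            MemLog::new(),
///                            NullStateMachine)
///     .unwrap();
/// ```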
275 | pub fn spawn(id: ServerId, 276 | addr: SocketAddr, 277 | peers: HashMap, 278 | store: L, 279 | state_machine: M) 280 | -> Result>> { 281 | thread::Builder::new() 282 | .name(format!("raft::Server({})", id)) 283 | .spawn(move || { 284 | let mut server = try!(Server::finalize(id, addr, peers, store, state_machine, 1500, 3000, 1000, 129)); 285 | server.run() 286 | }) 287 | .map_err(From::from) 288 | } 289 | /// Sends the message to the connection associated with the provided token. 290 | /// If sending the message fails, the connection is reset. 291 | fn send_message(&mut self, 292 | event_loop: &mut EventLoop>, 293 | token: Token, 294 | message: Rc>) { 295 | match self.connections[token].send_message(message) { 296 | Ok(false) => (), 297 | Ok(true) => { 298 | self.connections[token] 299 | .reregister(event_loop, token) 300 | .unwrap_or_else(|_| self.reset_connection(event_loop, token)); 301 | } 302 | Err(error) => { 303 | scoped_warn!("{:?}: error while sending message: {:?}", self, error); 304 | self.reset_connection(event_loop, token); 305 | } 306 | } 307 | } 308 | 309 | fn execute_actions(&mut self, event_loop: &mut EventLoop>, actions: Actions) { 310 | scoped_trace!("executing actions: {:?}", actions); 311 | let Actions { peer_messages, 312 | client_messages, 313 | timeouts, 314 | clear_timeouts, 315 | clear_peer_messages } = actions; 316 | 317 | if clear_peer_messages { 318 | for &token in self.peer_tokens.values() { 319 | self.connections[token].clear_messages(); 320 | } 321 | } 322 | for (peer, message) in peer_messages { 323 | let token = self.peer_tokens[&peer]; 324 | self.send_message(event_loop, token, message); 325 | } 326 | for (client, message) in client_messages { 327 | if let Some(&token) = self.client_tokens.get(&client) { 328 | self.send_message(event_loop, token, message); 329 | } 330 | } 331 | if clear_timeouts { 332 | for (timeout, &handle) in &self.consensus_timeouts { 333 | scoped_assert!(event_loop.clear_timeout(handle), 334 | "unable to clear timeout: {:?}", 335 | timeout); 336 | } 337 | self.consensus_timeouts.clear(); 338 | } 339 | for timeout in timeouts { 340 | let duration = timeout.duration_ms(&self.timeout_config); 341 | 342 | // Registering a timeout may only fail if the maximum number of timeouts 343 | // is already registered, which is by default 65,536. We use a 344 | // maximum of one timeout per peer, so this unwrap should be safe. 345 | let handle = event_loop.timeout_ms(ServerTimeout::Consensus(timeout), duration) 346 | .unwrap(); 347 | self.consensus_timeouts 348 | .insert(timeout, handle) 349 | .map(|handle| { 350 | scoped_assert!(event_loop.clear_timeout(handle), 351 | "unable to clear timeout: {:?}", 352 | timeout) 353 | }); 354 | } 355 | } 356 | 357 | /// Resets the connection corresponding to the provided token. 358 | /// 359 | /// If the connection is to a peer, the server will attempt to reconnect after a waiting 360 | /// period. 361 | /// 362 | /// If the connection is to a client or unknown it will be closed. 363 | fn reset_connection(&mut self, event_loop: &mut EventLoop>, token: Token) { 364 | let kind = *self.connections[token].kind(); 365 | match kind { 366 | ConnectionKind::Peer(..) => { 367 | // Crash if reseting the connection fails. 
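// `reset_peer` schedules a reconnect timeout on the event loop and returns
// its handle; the handle is tracked below so the timeout can be cleared if
// the peer re-establishes the connection first (see `readable`).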
368 | let (timeout, handle) = self.connections[token] 369 | .reset_peer(event_loop, token) 370 | .unwrap(); 371 | 372 | scoped_assert!(self.reconnection_timeouts.insert(token, handle).is_none(), 373 | "timeout already registered: {:?}", 374 | timeout); 375 | } 376 | ConnectionKind::Client(ref id) => { 377 | self.connections.remove(token).expect("unable to find client connection"); 378 | scoped_assert!(self.client_tokens.remove(id).is_some(), 379 | "client {:?} not connected", 380 | id); 381 | } 382 | ConnectionKind::Unknown => { 383 | self.connections.remove(token).expect("unable to find unknown connection"); 384 | } 385 | } 386 | } 387 | 388 | /// Reads messages from the connection until no more are available. 389 | /// 390 | /// If the connection returns an error on any operation, or any message fails to be 391 | /// deserialized, an error result is returned. 392 | fn readable(&mut self, event_loop: &mut EventLoop>, token: Token) -> Result<()> { 393 | scoped_trace!("{:?}: readable event", self.connections[token]); 394 | // Read messages from the connection until there are no more. 395 | while let Some(message) = try!(self.connections[token].readable()) { 396 | match *self.connections[token].kind() { 397 | ConnectionKind::Peer(id) => { 398 | let mut actions = Actions::new(); 399 | self.consensus.apply_peer_message(id, &message, &mut actions); 400 | self.execute_actions(event_loop, actions); 401 | } 402 | ConnectionKind::Client(id) => { 403 | let mut actions = Actions::new(); 404 | self.consensus.apply_client_message(id, &message, &mut actions); 405 | self.execute_actions(event_loop, actions); 406 | } 407 | ConnectionKind::Unknown => { 408 | let preamble = try!(message.get_root::()); 409 | match try!(preamble.get_id().which()) { 410 | connection_preamble::id::Which::Server(peer) => { 411 | let peer = try!(peer); 412 | let peer_id = ServerId(peer.get_id()); 413 | 414 | // Not the source address of this connection, but the 415 | // address the peer tells us it's listening on. 416 | let peer_addr = SocketAddr::from_str(try!(peer.get_addr())).unwrap(); 417 | scoped_debug!("received new connection from {:?} ({})", 418 | peer_id, 419 | peer_addr); 420 | 421 | self.connections[token].set_kind(ConnectionKind::Peer(peer_id)); 422 | // Use the advertised address, not the remote's source 423 | // address, for future retries in this connection. 424 | self.connections[token].set_addr(peer_addr); 425 | 426 | let prev_token = Some(self.peer_tokens 427 | .insert(peer_id, token) 428 | .expect("peer token not found")); 429 | 430 | // Close the existing connection, if any. 431 | // Currently, prev_token is never `None`; see above. 432 | // With config changes, this will have to be handled. 433 | match prev_token { 434 | Some(tok) => { 435 | self.connections 436 | .remove(tok) 437 | .expect("peer connection not found"); 438 | 439 | // Clear any timeouts associated with the existing connection. 440 | self.reconnection_timeouts 441 | .remove(&tok) 442 | .map(|handle| { 443 | scoped_assert!(event_loop.clear_timeout(handle)) 444 | }); 445 | } 446 | _ => unreachable!(), 447 | } 448 | // Notify consensus that the connection reset. 
449 | let mut actions = Actions::new(); 450 | self.consensus.peer_connection_reset(peer_id, peer_addr, &mut actions); 451 | self.execute_actions(event_loop, actions); 452 | } 453 | connection_preamble::id::Which::Client(Ok(id)) => { 454 | let client_id = try!(ClientId::from_bytes(id)); 455 | scoped_debug!("received new client connection from {}", client_id); 456 | self.connections[token].set_kind(ConnectionKind::Client(client_id)); 457 | let prev_token = self.client_tokens 458 | .insert(client_id, token); 459 | scoped_assert!(prev_token.is_none(), 460 | "{:?}: two clients connected with the same id: {:?}", 461 | self, 462 | client_id); 463 | } 464 | _ => { 465 | return Err(Error::Raft(RaftError::UnknownConnectionType)); 466 | } 467 | } 468 | } 469 | } 470 | } 471 | Ok(()) 472 | } 473 | 474 | /// Accepts a new TCP connection, adds it to the connection slab, and registers it with the 475 | /// event loop. 476 | fn accept_connection(&mut self, event_loop: &mut EventLoop>) -> Result<()> { 477 | scoped_trace!("accept_connection"); 478 | self.listener 479 | .accept() 480 | .map_err(From::from) 481 | .and_then(|stream_opt| { 482 | stream_opt.ok_or_else(|| { 483 | Error::Io(io::Error::new(io::ErrorKind::WouldBlock, 484 | "listener.accept() returned None")) 485 | }) 486 | }) 487 | .and_then(|(stream, _)| Connection::unknown(stream)) 488 | .and_then(|conn| { 489 | self.connections 490 | .insert(conn) 491 | .map_err(|_| Error::Raft(RaftError::ConnectionLimitReached)) 492 | }) 493 | .and_then(|token| 494 | // Until this point if any failures occur the connection is simply dropped. From 495 | // this point down, the connection is stored in the slab, so dropping it would 496 | // result in a leaked TCP stream and slab entry. Instead of dropping the 497 | // connection, it will be reset if an error occurs. 
498 | self.connections[token] 499 | .register(event_loop, token) 500 | .or_else(|_| { 501 | self.reset_connection(event_loop, token); 502 | Err(Error::Raft(RaftError::ConnectionRegisterFailed)) 503 | }) 504 | .map(|_| scoped_debug!("new connection accepted from {}", 505 | self.connections[token].addr()))) 506 | } 507 | } 508 | 509 | impl Handler for Server 510 | where L: Log, 511 | M: StateMachine 512 | { 513 | type Message = (); 514 | type Timeout = ServerTimeout; 515 | 516 | fn ready(&mut self, event_loop: &mut EventLoop>, token: Token, events: EventSet) { 517 | push_log_scope!("{:?}", self); 518 | scoped_trace!("ready; token: {:?}; events: {:?}", token, events); 519 | 520 | if events.is_error() { 521 | scoped_assert!(token != LISTENER, "unexpected error event from LISTENER"); 522 | scoped_warn!("{:?}: error event", self.connections[token]); 523 | self.reset_connection(event_loop, token); 524 | return; 525 | } 526 | 527 | if events.is_hup() { 528 | scoped_assert!(token != LISTENER, "unexpected hup event from LISTENER"); 529 | scoped_trace!("{:?}: hup event", self.connections[token]); 530 | self.reset_connection(event_loop, token); 531 | return; 532 | } 533 | 534 | if events.is_writable() { 535 | scoped_assert!(token != LISTENER, "unexpected writeable event for LISTENER"); 536 | if let Err(error) = self.connections[token].writable() { 537 | scoped_warn!("{:?}: failed write: {}", self.connections[token], error); 538 | self.reset_connection(event_loop, token); 539 | return; 540 | } 541 | if !events.is_readable() { 542 | self.connections[token] 543 | .reregister(event_loop, token) 544 | .unwrap_or_else(|_| self.reset_connection(event_loop, token)); 545 | } 546 | } 547 | 548 | if events.is_readable() { 549 | if token == LISTENER { 550 | self.accept_connection(event_loop) 551 | .unwrap_or_else(|error| scoped_warn!("unable to accept connection: {}", error)); 552 | } else { 553 | self.readable(event_loop, token) 554 | // Only reregister the connection with the event loop if no error occurs and 555 | // the connection is *not* reset. 
556 | .and_then(|_| self.connections[token].reregister(event_loop, token)) 557 | .unwrap_or_else(|error| { 558 | scoped_warn!("{:?}: failed read: {}", 559 | self.connections[token], error); 560 | self.reset_connection(event_loop, token); 561 | }); 562 | } 563 | } 564 | } 565 | 566 | fn timeout(&mut self, event_loop: &mut EventLoop>, timeout: ServerTimeout) { 567 | push_log_scope!("{:?}", self); 568 | scoped_trace!("timeout: {:?}", &timeout); 569 | match timeout { 570 | ServerTimeout::Consensus(consensus) => { 571 | scoped_assert!(self.consensus_timeouts.remove(&consensus).is_some(), 572 | "missing timeout: {:?}", 573 | timeout); 574 | let mut actions = Actions::new(); 575 | self.consensus.apply_timeout(consensus, &mut actions); 576 | self.execute_actions(event_loop, actions); 577 | } 578 | 579 | ServerTimeout::Reconnect(token) => { 580 | scoped_assert!(self.reconnection_timeouts.remove(&token).is_some(), 581 | "{:?} missing timeout: {:?}", 582 | self.connections[token], 583 | timeout); 584 | let local_addr = self.listener.local_addr(); 585 | scoped_assert!(local_addr.is_ok(), "could not obtain listener address"); 586 | let id = match *self.connections[token].kind() { 587 | ConnectionKind::Peer(id) => id, 588 | _ => unreachable!(), 589 | }; 590 | let addr = *self.connections[token].addr(); 591 | self.connections[token] 592 | .reconnect_peer(self.id, &local_addr.unwrap()) 593 | .and_then(|_| self.connections[token].register(event_loop, token)) 594 | .map(|_| { 595 | let mut actions = Actions::new(); 596 | self.consensus.peer_connection_reset(id, addr, &mut actions); 597 | self.execute_actions(event_loop, actions); 598 | }) 599 | .unwrap_or_else(|error| { 600 | scoped_warn!("unable to reconnect connection {:?}: {}", 601 | self.connections[token], 602 | error); 603 | self.reset_connection(event_loop, token); 604 | }); 605 | } 606 | } 607 | } 608 | } 609 | 610 | impl fmt::Debug for Server 611 | where L: Log, 612 | M: StateMachine 613 | { 614 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 615 | write!(fmt, "Server({})", self.id) 616 | } 617 | } 618 | 619 | #[cfg(test)] 620 | mod tests { 621 | 622 | extern crate env_logger; 623 | 624 | use std::collections::HashMap; 625 | use std::io::{self, Read, Write}; 626 | use std::net::{SocketAddr, TcpListener, TcpStream}; 627 | use std::str::FromStr; 628 | 629 | use capnp::message::ReaderOptions; 630 | use capnp::serialize; 631 | use mio::EventLoop; 632 | 633 | use ClientId; 634 | use Result; 635 | use ServerId; 636 | use messages; 637 | use messages_capnp::connection_preamble; 638 | use consensus::Actions; 639 | use state_machine::NullStateMachine; 640 | use persistent_log::MemLog; 641 | use super::*; 642 | 643 | type TestServer = Server; 644 | 645 | fn new_test_server(peers: HashMap) 646 | -> Result<(TestServer, EventLoop)> { 647 | let mut server = try!(Server::new(ServerId::from(0), 648 | SocketAddr::from_str("127.0.0.1:0").unwrap(), 649 | MemLog::new(), 650 | NullStateMachine) 651 | .with_peers(peers) 652 | .with_election_min_millis(1500) 653 | .with_election_max_millis(3000) 654 | .with_heartbeat_millis(1000) 655 | .with_max_connections(129) 656 | .finalize()); 657 | let event_loop = try!(server.start_loop()); 658 | Ok((server, event_loop)) 659 | } 660 | 661 | /// Attempts to grab a local, unbound socket address for testing. 
662 | fn get_unbound_address() -> SocketAddr { 663 | TcpListener::bind("127.0.0.1:0").unwrap().local_addr().unwrap() 664 | } 665 | 666 | /// Verifies that the proved stream has been sent a valid connection 667 | /// preamble. 668 | fn read_server_preamble(read: &mut R) -> ServerId 669 | where R: Read 670 | { 671 | let message = serialize::read_message(read, ReaderOptions::new()).unwrap(); 672 | let preamble = message.get_root::().unwrap(); 673 | 674 | match preamble.get_id().which().unwrap() { 675 | connection_preamble::id::Which::Server(peer) => ServerId::from(peer.unwrap().get_id()), 676 | _ => { 677 | panic!("unexpected preamble id"); 678 | } 679 | } 680 | } 681 | 682 | /// Returns true if the server has an open connection with the peer. 683 | fn peer_connected(server: &TestServer, peer: ServerId) -> bool { 684 | let token = server.peer_tokens[&peer]; 685 | server.reconnection_timeouts.get(&token).is_none() 686 | } 687 | 688 | /// Returns true if the server has an open connection with the client. 689 | fn client_connected(server: &TestServer, client: ClientId) -> bool { 690 | server.client_tokens.contains_key(&client) 691 | } 692 | 693 | /// Returns true if the provided TCP connection has been shutdown. 694 | /// 695 | /// TODO: figure out a more robust way to implement this, the current check 696 | /// will block the thread indefinitely if the stream is not shutdown. 697 | fn stream_shutdown(stream: &mut TcpStream) -> bool { 698 | let mut buf = [0u8; 128]; 699 | // OS X returns a read of 0 length for closed sockets. 700 | // Linux returns an errcode 104: Connection reset by peer. 701 | match stream.read(&mut buf) { 702 | Ok(0) => true, 703 | Err(ref error) if error.kind() == io::ErrorKind::ConnectionReset => true, 704 | Err(ref error) => panic!("unexpected error: {}", error), 705 | _ => false, 706 | } 707 | } 708 | 709 | /// Tests that a Server will reject an invalid peer configuration set. 710 | #[test] 711 | fn test_illegal_peer_set() { 712 | setup_test!("test_illegal_peer_set"); 713 | let peer_id = ServerId::from(0); 714 | let mut peers = HashMap::new(); 715 | peers.insert(peer_id, SocketAddr::from_str("127.0.0.1:0").unwrap()); 716 | assert!(new_test_server(peers).is_err()); 717 | } 718 | 719 | /// Tests that a Server connects to peer at startup, and reconnects when the 720 | /// connection is dropped. 721 | #[test] 722 | fn test_peer_connect() { 723 | setup_test!("test_peer_connect"); 724 | let peer_id = ServerId::from(1); 725 | 726 | let peer_listener = TcpListener::bind("127.0.0.1:0").unwrap(); 727 | 728 | let mut peers = HashMap::new(); 729 | peers.insert(peer_id, peer_listener.local_addr().unwrap()); 730 | let (mut server, mut event_loop) = new_test_server(peers).unwrap(); 731 | 732 | // Accept the server's connection. 733 | let (mut stream, _) = peer_listener.accept().unwrap(); 734 | 735 | // Check that the server sends a valid preamble. 736 | assert_eq!(ServerId::from(0), read_server_preamble(&mut stream)); 737 | assert!(peer_connected(&server, peer_id)); 738 | 739 | // Drop the connection. 740 | drop(stream); 741 | event_loop.run_once(&mut server, None).unwrap(); 742 | assert!(!peer_connected(&server, peer_id)); 743 | 744 | // Check that the server reconnects after a timeout. 745 | event_loop.run_once(&mut server, None).unwrap(); 746 | assert!(peer_connected(&server, peer_id)); 747 | let (mut stream, _) = peer_listener.accept().unwrap(); 748 | 749 | // Check that the server sends a valid preamble after the connection is 750 | // established. 
751 | assert_eq!(ServerId::from(0), read_server_preamble(&mut stream)); 752 | assert!(peer_connected(&server, peer_id)); 753 | } 754 | 755 | /// Tests that a Server will replace a peer's TCP connection if the peer 756 | /// connects through another TCP connection. 757 | #[test] 758 | fn test_peer_accept() { 759 | setup_test!("test_peer_accept"); 760 | let peer_id = ServerId::from(1); 761 | 762 | let peer_listener = TcpListener::bind("127.0.0.1:0").unwrap(); 763 | 764 | let mut peers = HashMap::new(); 765 | peers.insert(peer_id, peer_listener.local_addr().unwrap()); 766 | let (mut server, mut event_loop) = new_test_server(peers).unwrap(); 767 | 768 | // Accept the server's connection. 769 | let (mut in_stream, _) = peer_listener.accept().unwrap(); 770 | 771 | // Check that the server sends a valid preamble. 772 | assert_eq!(ServerId::from(0), read_server_preamble(&mut in_stream)); 773 | assert!(peer_connected(&server, peer_id)); 774 | 775 | let server_addr = server.listener.local_addr().unwrap(); 776 | 777 | // Open a replacement connection to the server. 778 | let mut out_stream = TcpStream::connect(server_addr).unwrap(); 779 | event_loop.run_once(&mut server, None).unwrap(); 780 | 781 | // This is what the new peer tells the server is listening address is. 782 | let fake_peer_addr = SocketAddr::from_str("192.168.0.1:12345").unwrap(); 783 | // Send server the preamble message to the server. 784 | serialize::write_message(&mut out_stream, 785 | &*messages::server_connection_preamble(peer_id, &fake_peer_addr)) 786 | .unwrap(); 787 | out_stream.flush().unwrap(); 788 | event_loop.run_once(&mut server, None).unwrap(); 789 | 790 | // Make sure that reconnecting updated the peer address 791 | // known to `Consensus` with the one given in the preamble. 792 | assert_eq!(server.consensus.peers()[&peer_id], fake_peer_addr); 793 | // Check that the server has closed the old connection. 794 | assert!(stream_shutdown(&mut in_stream)); 795 | // Check that there's a connection which has the fake address 796 | // stored for reconnection purposes. 797 | assert!(server.connections.iter().any(|conn| conn.addr().port() == 12345)) 798 | } 799 | 800 | /// Tests that the server will accept a client connection, then disposes of 801 | /// it when the client disconnects. 802 | #[test] 803 | fn test_client_accept() { 804 | setup_test!("test_client_accept"); 805 | 806 | let (mut server, mut event_loop) = new_test_server(HashMap::new()).unwrap(); 807 | 808 | // Connect to the server. 809 | let server_addr = server.listener.local_addr().unwrap(); 810 | let mut stream = TcpStream::connect(server_addr).unwrap(); 811 | event_loop.run_once(&mut server, None).unwrap(); 812 | 813 | let client_id = ClientId::new(); 814 | 815 | // Send the client preamble message to the server. 816 | serialize::write_message(&mut stream, 817 | &*messages::client_connection_preamble(client_id)) 818 | .unwrap(); 819 | stream.flush().unwrap(); 820 | event_loop.run_once(&mut server, None).unwrap(); 821 | 822 | // Check that the server holds on to the client connection. 823 | assert!(client_connected(&server, client_id)); 824 | 825 | // Check that the server disposes of the client connection when the TCP 826 | // stream is dropped. 827 | drop(stream); 828 | event_loop.run_once(&mut server, None).unwrap(); 829 | assert!(!client_connected(&server, client_id)); 830 | } 831 | 832 | /// Tests that the server will throw away connections that do not properly 833 | /// send a preamble. 
834 | #[test] 835 | fn test_invalid_accept() { 836 | setup_test!("test_invalid_accept"); 837 | 838 | let (mut server, mut event_loop) = new_test_server(HashMap::new()).unwrap(); 839 | 840 | // Connect to the server. 841 | let server_addr = server.listener.local_addr().unwrap(); 842 | let mut stream = TcpStream::connect(server_addr).unwrap(); 843 | event_loop.run_once(&mut server, None).unwrap(); 844 | 845 | // Send an invalid preamble. 846 | stream.write(b"foo bar baz").unwrap(); 847 | stream.flush().unwrap(); 848 | event_loop.run_once(&mut server, None).unwrap(); 849 | 850 | // Check that the server disposes of the connection. 851 | assert!(stream_shutdown(&mut stream)); 852 | } 853 | 854 | /// Tests that the server will reset a peer connection when an invalid 855 | /// message is received. 856 | #[test] 857 | fn test_invalid_peer_message() { 858 | setup_test!("test_invalid_peer_message"); 859 | 860 | let peer_id = ServerId::from(1); 861 | 862 | let peer_listener = TcpListener::bind("127.0.0.1:0").unwrap(); 863 | 864 | let mut peers = HashMap::new(); 865 | peers.insert(peer_id, peer_listener.local_addr().unwrap()); 866 | let (mut server, mut event_loop) = new_test_server(peers).unwrap(); 867 | 868 | // Accept the server's connection. 869 | let (mut stream_a, _) = peer_listener.accept().unwrap(); 870 | 871 | // Read the server's preamble. 872 | assert_eq!(ServerId::from(0), read_server_preamble(&mut stream_a)); 873 | 874 | // Send an invalid message. 875 | stream_a.write(b"foo bar baz").unwrap(); 876 | stream_a.flush().unwrap(); 877 | event_loop.run_once(&mut server, None).unwrap(); 878 | 879 | // Check that the server resets the connection. 880 | assert!(!peer_connected(&server, peer_id)); 881 | 882 | // Check that the server reconnects after a timeout. 883 | event_loop.run_once(&mut server, None).unwrap(); 884 | assert!(peer_connected(&server, peer_id)); 885 | } 886 | 887 | /// Tests that the server will reset a client connection when an invalid 888 | /// message is received. 889 | #[test] 890 | fn test_invalid_client_message() { 891 | setup_test!("test_invalid_client_message"); 892 | 893 | let (mut server, mut event_loop) = new_test_server(HashMap::new()).unwrap(); 894 | 895 | // Connect to the server. 896 | let server_addr = server.listener.local_addr().unwrap(); 897 | let mut stream = TcpStream::connect(server_addr).unwrap(); 898 | event_loop.run_once(&mut server, None).unwrap(); 899 | 900 | let client_id = ClientId::new(); 901 | 902 | // Send the client preamble message to the server. 903 | serialize::write_message(&mut stream, 904 | &*messages::client_connection_preamble(client_id)) 905 | .unwrap(); 906 | stream.flush().unwrap(); 907 | event_loop.run_once(&mut server, None).unwrap(); 908 | 909 | // Check that the server holds on to the client connection. 910 | assert!(client_connected(&server, client_id)); 911 | 912 | // Send an invalid client message to the server. 913 | stream.write(b"foo bar baz").unwrap(); 914 | stream.flush().unwrap(); 915 | event_loop.run_once(&mut server, None).unwrap(); 916 | 917 | // Check that the server disposes of the client connection. 918 | assert!(!client_connected(&server, client_id)); 919 | } 920 | 921 | /// Tests that a Server will attempt to connect to peers on startup, and 922 | /// immediately reset the connection if unreachable. 
923 | #[test]
924 | fn test_unreachable_peer() {
925 | setup_test!("test_unreachable_peer_reconnect");
926 | let peer_id = ServerId::from(1);
927 | let mut peers = HashMap::new();
928 | peers.insert(peer_id, get_unbound_address());
929 |
930 | // Creates the Server, which registers the peer connection, and
931 | // immediately resets it.
932 | let (mut server, _) = new_test_server(peers).unwrap();
933 | assert!(!peer_connected(&mut server, peer_id));
934 | }
935 |
936 | /// Tests that the server will send a message to a peer connection.
937 | #[test]
938 | fn test_connection_send() {
939 | setup_test!("test_connection_send");
940 | let peer_id = ServerId::from(1);
941 |
942 | let peer_listener = TcpListener::bind("127.0.0.1:0").unwrap();
943 |
944 | let mut peers = HashMap::new();
945 | let peer_addr = peer_listener.local_addr().unwrap();
946 | peers.insert(peer_id, peer_addr);
947 | let (mut server, mut event_loop) = new_test_server(peers).unwrap();
948 |
949 | // Accept the server's connection.
950 | let (mut in_stream, _) = peer_listener.accept().unwrap();
951 |
952 | // Read the preamble.
953 | assert_eq!(ServerId::from(0), read_server_preamble(&mut in_stream));
954 |
955 | // Send a test message (the type is not important).
956 | let mut actions = Actions::new();
957 | actions.peer_messages
958 | .push((peer_id, messages::server_connection_preamble(peer_id, &peer_addr)));
959 | server.execute_actions(&mut event_loop, actions);
960 |
961 | assert_eq!(peer_id, read_server_preamble(&mut in_stream));
962 | }
963 | }
964 |
-------------------------------------------------------------------------------- /src/state.rs: --------------------------------------------------------------------------------
1 | use std::collections::{HashMap, HashSet, VecDeque};
2 |
3 | use ClientId;
4 | use LogIndex;
5 | use ServerId;
6 |
7 | /// Consensus modules can be in one of three states:
8 | ///
9 | /// * `Follower` - which replicates AppendEntries requests and votes for its leader.
10 | /// * `Leader` - which leads the cluster by serving incoming requests, ensuring
11 | /// data is replicated, and issuing heartbeats.
12 | /// * `Candidate` - which campaigns in an election and may become a `Leader`
13 | /// (if it gets enough votes) or a `Follower`, if it hears from
14 | /// a `Leader`.
15 | #[derive(Clone, Debug, PartialEq, Eq)]
16 | pub enum ConsensusState {
17 | Follower,
18 | Candidate,
19 | Leader,
20 | }
21 |
22 | /// The state associated with a Raft consensus module in the `Leader` state.
23 | #[derive(Clone, Debug)]
24 | pub struct LeaderState {
25 | next_index: HashMap<ServerId, LogIndex>,
26 | match_index: HashMap<ServerId, LogIndex>,
27 | /// Stores in-flight client proposals.
28 | pub proposals: VecDeque<(ClientId, LogIndex)>,
29 | }
30 |
31 | impl LeaderState {
32 | /// Returns a new `LeaderState` struct.
33 | ///
34 | /// # Arguments
35 | ///
36 | /// * `latest_log_index` - The index of the leader's most recent log entry at the
37 | /// time of election.
38 | /// * `peers` - The set of peer cluster members.
39 | pub fn new(latest_log_index: LogIndex, peers: &HashSet<ServerId>) -> LeaderState {
40 | let next_index = peers.iter().cloned().map(|peer| (peer, latest_log_index + 1)).collect();
41 | let match_index = peers.iter().cloned().map(|peer| (peer, LogIndex::from(0))).collect();
42 |
43 | LeaderState {
44 | next_index: next_index,
45 | match_index: match_index,
46 | proposals: VecDeque::new(),
47 | }
48 | }
49 |
50 | /// Returns the next log entry index of the follower.
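///
/// Immediately after `LeaderState::new` or `reinitialize` this is the leader's
/// `latest_log_index + 1` for every follower; in standard Raft fashion the
/// consensus module (not shown in this file) walks it backwards when a follower
/// rejects an `AppendEntries` request, until the two logs agree.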
51 | pub fn next_index(&mut self, follower: &ServerId) -> LogIndex {
52 | self.next_index[follower]
53 | }
54 |
55 | /// Sets the next log entry index of the follower.
56 | pub fn set_next_index(&mut self, follower: ServerId, index: LogIndex) {
57 | self.next_index.insert(follower, index);
58 | }
59 |
60 | /// Sets the index of the highest log entry known to be replicated on the
61 | /// follower.
62 | pub fn set_match_index(&mut self, follower: ServerId, index: LogIndex) {
63 | self.match_index.insert(follower, index);
64 | }
65 |
66 | /// Counts the number of followers containing the given log index.
67 | pub fn count_match_indexes(&self, index: LogIndex) -> usize {
68 | // +1 for self.
69 | self.match_index.values().filter(|&&i| i >= index).count() + 1
70 | }
71 |
72 | /// Reinitializes the state following an election.
73 | pub fn reinitialize(&mut self, latest_log_index: LogIndex) {
74 | for next_index in self.next_index.values_mut() {
75 | *next_index = latest_log_index + 1;
76 | }
77 | for match_index in self.match_index.values_mut() {
78 | *match_index = LogIndex::from(0);
79 | }
80 | self.proposals.clear();
81 | }
82 | }
83 |
84 | /// The state associated with a Raft consensus module in the `Candidate` state.
85 | #[derive(Clone, Debug)]
86 | pub struct CandidateState {
87 | granted_votes: HashSet<ServerId>,
88 | }
89 |
90 | impl CandidateState {
91 | /// Creates a new `CandidateState`.
92 | pub fn new() -> CandidateState {
93 | CandidateState { granted_votes: HashSet::new() }
94 | }
95 |
96 | /// Records a vote from `voter`.
97 | pub fn record_vote(&mut self, voter: ServerId) {
98 | self.granted_votes.insert(voter);
99 | }
100 |
101 | /// Returns the number of votes.
102 | pub fn count_votes(&self) -> usize {
103 | self.granted_votes.len()
104 | }
105 |
106 | /// Clears the vote count.
107 | pub fn clear(&mut self) {
108 | self.granted_votes.clear();
109 | }
110 |
111 | /// Returns whether the peer has voted in the current election.
112 | pub fn peer_voted(&self, voter: ServerId) -> bool {
113 | self.granted_votes.contains(&voter)
114 | }
115 | }
116 |
117 | /// The state associated with a Raft consensus module in the `Follower` state.
118 | #[derive(Clone, Debug)]
119 | pub struct FollowerState {
120 | /// The most recent leader of the follower. The leader is not guaranteed to be active, so this
121 | /// should only be used as a hint.
122 | pub leader: Option<ServerId>,
123 | /// The minimal index at which entries can be appended. This bit of state
124 | /// allows avoiding overwriting of possibly committed parts of the log
125 | /// when messages arrive out of order. It is reset on set_leader() and
126 | /// otherwise left untouched.
127 | /// See ktoso/akka-raft#66.
128 | pub min_index: LogIndex,
129 | }
130 |
131 | impl FollowerState {
132 | /// Returns a new `FollowerState`.
133 | pub fn new() -> FollowerState {
134 | FollowerState {
135 | leader: None,
136 | min_index: LogIndex(0),
137 | }
138 | }
139 |
140 | /// Sets a new leader.
141 | pub fn set_leader(&mut self, leader: ServerId) {
142 | self.leader = Some(leader);
143 | self.min_index = LogIndex(0);
144 | }
145 | }
146 |
147 | #[cfg(test)]
148 | mod tests {
149 | use std::collections::HashSet;
150 |
151 | use {LogIndex, ServerId};
152 | use state::LeaderState;
153 |
154 | /// Tests the `LeaderState`'s `.count_match_indexes()` function and makes sure it
155 | /// produces the correct values.
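///
/// For example, with three peers plus the leader there are four voters, so an
/// index counts as replicated on a majority once `count_match_indexes` returns
/// at least 3; the final assertions below exercise exactly that case.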
156 | #[test]
157 | fn test_count_match_indexes() {
158 | let index = LogIndex(0);
159 | let mut peers = HashSet::new();
160 |
161 | // All peers start at 0 index.
162 | let leader_state = LeaderState::new(index, &peers);
163 | // Should be one, since the leader node always counts as matched.
164 | assert_eq!(1, leader_state.count_match_indexes(LogIndex(0)));
165 |
166 | peers.insert(ServerId(1));
167 | let leader_state = LeaderState::new(index, &peers);
168 | assert_eq!(2, leader_state.count_match_indexes(LogIndex(0)));
169 |
170 | peers.insert(ServerId(2));
171 | let leader_state = LeaderState::new(index, &peers);
172 | assert_eq!(3, leader_state.count_match_indexes(LogIndex(0)));
173 |
174 | peers.insert(ServerId(3));
175 | let mut leader_state = LeaderState::new(index, &peers);
176 | assert_eq!(4, leader_state.count_match_indexes(LogIndex(0)));
177 |
178 | leader_state.set_match_index(ServerId(1), LogIndex(1));
179 | leader_state.set_match_index(ServerId(2), LogIndex(1));
180 | assert_eq!(3, leader_state.count_match_indexes(LogIndex(1)));
181 | }
182 | }
183 |
-------------------------------------------------------------------------------- /src/state_machine/channel.rs: --------------------------------------------------------------------------------
1 | use std::fmt::{self, Debug};
2 | use std::sync::mpsc;
3 |
4 | use state_machine::StateMachine;
5 |
6 |
7 | /// A state machine that simply redirects all commands to a channel.
8 | ///
9 | /// This state machine is chiefly meant for testing.
10 | pub struct ChannelStateMachine {
11 | tx: mpsc::Sender<Vec<u8>>,
12 | }
13 |
14 | impl ChannelStateMachine {
15 | pub fn new() -> (ChannelStateMachine, mpsc::Receiver<Vec<u8>>) {
16 | let (tx, recv) = mpsc::channel();
17 | (ChannelStateMachine { tx: tx }, recv)
18 | }
19 | }
20 |
21 | impl StateMachine for ChannelStateMachine {
22 | fn apply(&mut self, command: &[u8]) -> Vec<u8> {
23 | self.tx
24 | .send(command.to_vec())
25 | .map(|_| Vec::new())
26 | .unwrap_or_else(|_| b"An error occurred."[..].into())
27 | }
28 |
29 | fn query(&self, _query: &[u8]) -> Vec<u8> {
30 | unimplemented!()
31 | }
32 |
33 | fn snapshot(&self) -> Vec<u8> {
34 | Vec::new()
35 | }
36 |
37 | fn restore_snapshot(&mut self, _snapshot: Vec<u8>) -> () {
38 | ()
39 | }
40 | }
41 |
42 | impl Debug for ChannelStateMachine {
43 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
44 | write!(fmt, "ChannelStateMachine")
45 | }
46 | }
47 |
-------------------------------------------------------------------------------- /src/state_machine/mod.rs: --------------------------------------------------------------------------------
1 | //! A `StateMachine` is a single instance of a distributed application. It is the `raft` library's
2 | //! responsibility to take commands from the `Client` and apply them to each `StateMachine`
3 | //! instance in a globally consistent order.
4 | //!
5 | //! The `StateMachine` interface is intentionally generic so that any distributed application
6 | //! needing consistent state can be built on it. For instance, a distributed hash table
7 | //! application could implement `StateMachine`, with commands corresponding to `insert` and
8 | //! `remove`. The `raft` library would guarantee that the same order of `insert` and `remove`
9 | //! commands would be seen by all consensus modules.
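//!
//! As a sketch (not part of this crate), a trivial last-write-wins register
//! could implement the trait as follows; the type name is illustrative only:
//!
//! ```ignore
//! use raft::state_machine::StateMachine;
//!
//! #[derive(Debug)]
//! struct RegisterStateMachine {
//!     value: Vec<u8>,
//! }
//!
//! impl StateMachine for RegisterStateMachine {
//!     // Commands arrive here only after they have been committed by the cluster.
//!     fn apply(&mut self, command: &[u8]) -> Vec<u8> {
//!         self.value = command.to_vec();
//!         self.value.clone()
//!     }
//!
//!     // Reads bypass the durable log and never mutate state.
//!     fn query(&self, _query: &[u8]) -> Vec<u8> {
//!         self.value.clone()
//!     }
//!
//!     fn snapshot(&self) -> Vec<u8> {
//!         self.value.clone()
//!     }
//!
//!     fn restore_snapshot(&mut self, snapshot: Vec<u8>) {
//!         self.value = snapshot;
//!     }
//! }
//! ```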
10 | use std::fmt::Debug;
11 |
12 | mod channel;
13 | mod null;
14 |
15 | pub use state_machine::channel::ChannelStateMachine;
16 | pub use state_machine::null::NullStateMachine;
17 |
18 | /// This trait is meant to be implemented such that the commands issued to it via `apply()` will
19 | /// be reflected in your consuming application. Commands sent via `apply()` have been committed
20 | /// in the cluster. Unlike `store`, your application should consume data produced by this and
21 | /// accept it as truth.
22 | ///
23 | /// Note that you are responsible for **not crashing** the state machine. Your production
24 | /// implementation should not use `.unwrap()`, `.expect()`, or anything else that likes to `panic!()`.
25 | pub trait StateMachine: Debug + Send + 'static {
26 | /// Applies a command to the state machine.
27 | /// Returns an application-specific result value.
28 | fn apply(&mut self, command: &[u8]) -> Vec<u8>;
29 |
30 | /// Queries a value of the state machine. Does not go through the durable log, or mutate the
31 | /// state machine.
32 | /// Returns an application-specific result value.
33 | fn query(&self, query: &[u8]) -> Vec<u8>;
34 |
35 | /// Take a snapshot of the state machine.
36 | fn snapshot(&self) -> Vec<u8>;
37 |
38 | /// Restore a snapshot of the state machine.
39 | fn restore_snapshot(&mut self, snapshot: Vec<u8>) -> ();
40 | }
41 |
-------------------------------------------------------------------------------- /src/state_machine/null.rs: --------------------------------------------------------------------------------
1 | use state_machine::StateMachine;
2 |
3 | /// A state machine with no states.
4 | #[derive(Debug)]
5 | pub struct NullStateMachine;
6 |
7 | impl StateMachine for NullStateMachine {
8 | fn apply(&mut self, _command: &[u8]) -> Vec<u8> {
9 | Vec::new()
10 | }
11 |
12 | fn query(&self, _query: &[u8]) -> Vec<u8> {
13 | Vec::new()
14 | }
15 |
16 | fn snapshot(&self) -> Vec<u8> {
17 | Vec::new()
18 | }
19 |
20 | fn restore_snapshot(&mut self, _snapshot: Vec<u8>) {
21 | ()
22 | }
23 | }
24 |
--------------------------------------------------------------------------------