├── LICENSE ├── Makefile ├── README.md ├── abstract └── abstract.tex ├── ack └── ack.tex ├── acmcaps.bst ├── basicraft ├── cheatsheet.pdf ├── conclusion.tex ├── consensus.tex ├── diverge2.svg ├── figproperties.tex ├── followercandidateleader.svg ├── intro.tex ├── log2.svg ├── oldTermCommit.svg ├── raft.tex ├── safety2.svg ├── terms.svg └── understandability.tex ├── book.pdf ├── cc-by.svg ├── clients ├── cheatsheet.pdf ├── clients.tex ├── leases.svg └── retrydup.svg ├── compaction ├── Makefrag ├── cheatsheet.pdf ├── cleaning.svg ├── compaction.tex ├── conclusion.tex ├── disksnapshot.tex ├── incremental.tex ├── intro.tex ├── leader.tex ├── logbased.svg ├── memsnapshot.tex ├── rules.pdf └── snapshot.svg ├── conclusion └── conclusion.tex ├── correctness └── correctness.tex ├── intro └── intro.tex ├── leaderelection ├── benchmarks-randomness.svg ├── benchmarks-scale.svg ├── conclusion.tex ├── distance.svg ├── earliesttimeout.svg ├── earliesttimeoutdiagram.svg ├── leaderelection.tex ├── logsdiff.tex ├── multi-submission-failures-logsdiff.svg ├── multi-submission-failures.svg ├── nosplit.tex ├── overall.svg ├── prevote.tex ├── real.tex ├── splitrate.tex ├── splits.svg ├── splittotal.tex ├── splitvotediagram.svg └── timeline.svg ├── local.bib ├── main.tex ├── membership ├── Makefrag ├── arbitrary.tex ├── availability.tex ├── catchup2.svg ├── catchup3.svg ├── cheatsheet2.pdf ├── disruptive.svg ├── membership.tex ├── reconfigurationconf.svg ├── reconfigurationdifficulty.svg ├── removedlog3.svg ├── safety.tex └── special.svg ├── motivation ├── activeactive.svg ├── activepassive.svg ├── bigdata.svg ├── motivation.tex ├── paxos.tex ├── paxossummary.pdf ├── problem.tex ├── statemachine.svg └── uses.tex ├── online-trim.pdf ├── online.pdf ├── performance ├── Makefrag ├── implementation.tex ├── latency.svg ├── performance.tex ├── pipeline.svg ├── threads.svg └── throughput.svg ├── proof ├── defs.tex ├── proof.tex ├── rafttla.tex └── tlatex.sty ├── related ├── Makefrag ├── algos.tex ├── compaction.tex ├── correctness.tex ├── leaderelection.tex ├── logreplication.tex ├── membership.tex ├── performance.tex ├── primarybackup-rsm.svg ├── primarybackup.tex ├── related.tex ├── rereplicate.svg └── understandability.tex ├── stanford.pdf ├── suthesis-2e-mod.sty ├── userstudy ├── breakdown.svg ├── breakdownlegend.svg ├── diffBreak.svg ├── order.svg ├── pairedcdf.svg ├── pairedscatter.svg ├── pairedscatterpaxos.svg ├── stylusoverlay.png ├── survey.svg ├── surveyfair.svg ├── surveypaxos.svg ├── unpairedcdf.svg └── userstudy.tex └── userstudymaterials ├── committed.png ├── inconsistency.png ├── legala.png ├── legalb.png ├── legalc.png ├── legald.png ├── paxosLoga.png ├── paxosLogb.png ├── paxosLogc.png ├── paxosLogd.png ├── paxossummary.pdf ├── raftsummary.pdf └── userstudyquizzes.tex /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 Diego Ongaro. 2 | 3 | This work is licensed under: 4 | - Creative Commons Attribution-3.0 United States License 5 | http://creativecommons.org/licenses/by/3.0/us/ , or 6 | - Creative Commons Attribution-4.0 International License 7 | https://creativecommons.org/licenses/by/4.0/ . 8 | 9 | 10 | This dissertation expands on a paper written by Diego Ongaro and 11 | John Ousterhout: 12 | D. Ongaro and J. Ousterhout, 13 | "In Search of an Understandable Consensus Algorithm," 14 | 2014 USENIX Annual Technical Conference (USENIX ATC '14), 15 | June 2014, pp. 305-319. 
16 | Most of the paper's content is included in some form in this dissertation. It 17 | is reproduced in this dissertation and licensed under the Creative Commons 18 | Attribution licenses with permission from John Ousterhout. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | main.pdf: 2 | 3 | LATEX = pdflatex -interaction=nonstopmode 4 | BIBTEX = bibtex -min-crossrefs=1000 5 | RERUN = ! egrep -q '(^LaTeX Warning:|\(natbib\)).* Rerun' 6 | UNDEFINED = ! egrep -q '^(LaTeX|Package natbib) Warning:.* undefined' 7 | 8 | FIGPNG = $(wildcard *.png */*.png) 9 | FIGSVG = $(wildcard *.svg */*.svg) 10 | FIGGENPDF = $(patsubst %.svg,%.pdf,$(FIGSVG)) 11 | 12 | include */Makefrag 13 | 14 | main.pdf: *.tex */*.tex *.sty *.bib $(FIGPNG) $(FIGGENPDF) 15 | $(LATEX) main.tex || (rm -f main.pdf && false) 16 | $(BIBTEX) main.aux || (rm -f main.pdf && false) 17 | $(LATEX) main.tex || (rm -f main.pdf && false) 18 | $(LATEX) main.tex || (rm -f main.pdf && false) 19 | $(LATEX) main.tex || (rm -f main.pdf && false) 20 | $(RERUN) main.log 21 | $(UNDEFINED) main.log 22 | 23 | %.pdf: %.svg 24 | inkscape -T -z -A $@ $< 25 | 26 | IGNORE = \ 27 | *.aux \ 28 | */*.aux \ 29 | main.aux \ 30 | main.bbl \ 31 | main.blg \ 32 | main.brf \ 33 | main.lof \ 34 | main.log \ 35 | main.lot \ 36 | main.pdf \ 37 | main.out \ 38 | main.toc \ 39 | $(FIGGENPDF) 40 | 41 | .PHONY: clean 42 | clean: 43 | rm -f $(IGNORE) 44 | 45 | .PHONY: .gitignore 46 | .gitignore: 47 | echo .gitignore $(IGNORE) | sed 's/ /\n/g' > .gitignore 48 | 49 | # The following target is useful for debugging Makefiles; it 50 | # prints the value of a make variable. 51 | print-%: 52 | @echo $* = $($*) 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Consensus: Bridging Theory and Practice 2 | 3 | This repository contains Diego Ongaro's PhD dissertation, *Consensus: Bridging 4 | Theory and Practice*, published by Stanford University in 2014. The 5 | dissertation PDFs and the dissertation sources are licensed under the Creative 6 | Commons Attribution license, as described in the [LICENSE](LICENSE) file. 7 | 8 | Several pre-built PDFs are included: 9 | 10 | - [book.pdf](book.pdf?raw=true): formatted for a printed book 11 | (8.5x11" pages, bigger inside margins for binding, black hyperlinks) 12 | - [online.pdf](online.pdf?raw=true): formatted for normal digital or print use 13 | (8.5x11" pages, consistent margins on all pages, blue hyperlinks) 14 | - [online-trim.pdf](online-trim.pdf?raw=true): formatted for digital viewing 15 | (6.6x9.35" pages, minimal margins, blue hyperlinks) 16 | - [stanford.pdf](stanford.pdf?raw=true): exact file distributed by 17 | Stanford University 18 | (similar to `online.pdf`; copyright, signature, and preface pages differ) 19 | 20 | All of these use the same page numbers starting at page 1, though `book.pdf` has 21 | an additional blank page before the introduction (page xviii). 22 | 23 | The source materials for the dissertation are made available here in the hopes 24 | that they might be useful, for example, to reformat the dissertation for a 25 | different medium or to copy sections for use in other documents (per the 26 | [LICENSE](LICENSE)). 
It requires the following to build: 27 | 28 | - GNU make 29 | - pdflatex 30 | - bibtex 31 | - Inkscape (to convert SVG images and layers from SVG images to PDF format) 32 | 33 | ## Updates and Errata 34 | 35 | ### Chapter 3: Basic Raft algorithm 36 | 37 | - Figure 3.1 (cheatsheet): Although lastApplied is listed as volatile state, it should be as volatile as the state machine. If the state machine is volatile, lastApplied should be volatile. If the state machine is persistent, lastApplied should be just as persistent. 38 | 39 | ### Chapter 4: Cluster membership changes 40 | 41 | - There's an **important bug** in single-server changes, fortunately with an easy fix. See the [raft-dev post](https://groups.google.com/d/msg/raft-dev/t4xj6dJTP6E/d2D9LrWRza8J). 42 | 43 | ### Chapter 6: Client interaction 44 | 45 | - "it would extends its lease" should read "it would extend its lease" (Figure 6.3 caption). 46 | 47 | ### Chapter 7: Raft user study / Appendix A: User study materials 48 | 49 | - "Log Completeness" should read "Leader Completeness" (3x). 50 | 51 | ### Chapter 8: Correctness / Appendix B: Safety proof and formal specification 52 | 53 | - The Verdi team at the University of Washington has completed a machine-checked proof of safety for the basic Raft algorithm in Coq. See the [press release](https://news.cs.washington.edu/2015/08/07/uw-cses-verdi-team-completes-first-full-formal-verification-of-raft-consensus-protocol/). 54 | - There have been a few minor updates to the TLA+ specification: . 55 | 56 | ### Chapter 10: Implementation and performance 57 | 58 | - The first implementation of pipelining wasn't quite right and never made it into LogCabin's master branch. [Issue 97](https://github.com/logcabin/logcabin/issues/97) tracks this. 59 | 60 | ### Bibliography 61 | 62 | - The Raft website has moved to . 63 | -------------------------------------------------------------------------------- /abstract/abstract.tex: -------------------------------------------------------------------------------- 1 | \prefacesection{Abstract} 2 | 3 | Distributed consensus is fundamental to building fault-tolerant systems. 4 | It allows a collection of machines to work as a coherent group that can 5 | survive the failures of some of its members. Unfortunately, the most 6 | common consensus algorithm, Paxos, is widely regarded as difficult to 7 | understand and implement correctly. 8 | 9 | This dissertation presents a new consensus algorithm called Raft, which 10 | was designed for understandability. Raft first elects a server as leader, 11 | then concentrates all decision-making onto the leader. These two basic 12 | steps are relatively independent and form a better structure than Paxos, 13 | whose components are hard to separate. Raft elects a leader using voting 14 | and randomized timeouts. The election guarantees that the leader already 15 | stores all the information it needs, so data only flows outwards 16 | from the leader to other servers. Compared to other leader-based 17 | algorithms, this reduces mechanism and simplifies the behavior. Once a 18 | leader is elected, it manages a replicated log. Raft leverages a simple 19 | invariant on how logs grow to reduce the algorithm's state space and 20 | accomplish this task with minimal mechanism. 21 | 22 | Raft is also more suitable than previous algorithms for real-world 23 | implementations. 
It performs well enough for practical deployments, and 24 | it addresses all aspects of building a complete system, including how to 25 | manage client interactions, how to change the cluster membership, and 26 | how to compact the log when it grows too large. To change the cluster 27 | membership, Raft allows adding or removing one server at a time (complex 28 | changes can be composed from these basic steps), and the cluster 29 | continues servicing requests throughout the change. 30 | 31 | We believe that Raft is superior to Paxos and other consensus 32 | algorithms, both for educational purposes and as a foundation for 33 | implementation. Results from a user study demonstrate that Raft is 34 | easier for students to learn than Paxos. The algorithm has been formally 35 | specified and proven, its leader election algorithm 36 | works well in a variety of environments, and its performance is 37 | equivalent to Multi-Paxos. Many implementations of Raft are now 38 | available, and several companies are deploying Raft. 39 | -------------------------------------------------------------------------------- /ack/ack.tex: -------------------------------------------------------------------------------- 1 | \prefacesection{Acknowledgments} 2 | 3 | Thanks to my family and friends for supporting me throughout the ups and 4 | downs of grad school. Mom, thanks for continuously pushing me to do well 5 | academically, even when I didn't see the point. I still don't know how 6 | you got me out of bed at 6~a.m.\ all those mornings. Dad, thanks for 7 | helping us earn these six (seven?) degrees, and I hope we've made you proud. 8 | Zeide, I wish I could give 9 | you a copy of this small book for your collection. Ernesto, thanks for 10 | sparking my interest in computers; I still think they're pretty cool. 11 | Laura, I'll let you know if and when I discover a RAMCloud. Thanks for 12 | listening to hours of my drama, even when you didn't understand the 13 | nouns. Jenny, thanks for helping me get through the drudgery of writing 14 | this dissertation and for making me smile the whole way through. You're 15 | crazy for having wanted to read this, and you're weird for having 16 | enjoyed it. 17 | 18 | I learned a ton from my many labmates, both in RAMCloud and in SCS. 19 | Deian, I don't know why you always cared about my work; I never 20 | understood your passion for that IFC nonsense, but keep simplifying it 21 | until us mortals can use it. Ankita, you've single-handedly increased 22 | the lab's average self-esteem and optimism by at least 20\%. I've 23 | watched you learn so much already; keep absorbing it all, and I hope 24 | you're able to see how far you've come. Good luck with your role as the 25 | new Senior Student. Thanks especially to Ryan and Steve, with whom I 26 | formed the first generation of RAMCloud students. Ryan, believe it or 27 | not, your optimism helped. You were always excited about wacky ideas, 28 | and I always looked forward to swapping CSBs (``cool story, bro'') with 29 | you. You'll make a great advisor. Steve, I miss your intolerance for 30 | bullshit, and I strive to match your standards for your own engineering 31 | work. You continuously shocked the rest of us with those silent bursts 32 | of productivity, where you'd get quarter-long projects done over a 33 | single weekend. You guys also figured out all the program requirements 34 | before I did and told me all the tricks. I continue to follow your lead 35 | even after you've moved on. 
(Ryan, you incorrectly used the British 36 | spelling ``acknowledgements'' rather than the American 37 | ``acknowledgments''. Steve, you did too, but you're just Canadian, not 38 | wrong.) 39 | 40 | Thanks to the many professors who have advised me along the way. John 41 | Ousterhout, my Ph.D.\ advisor, should be a coauthor on this dissertation 42 | (but I don't think they would give me a degree that way). I have never 43 | learned as much professionally from any other person. 44 | John teaches by setting a great example of how to 45 | code, to evaluate, to design, to think, and to write \emph{well}. 46 | I have never quite 47 | been on David \mazieres{}'s same wavelength; he's usually 48 | \SIrange{10}{30}{minutes} 49 | ahead in conversation. As soon as I could almost keep up with him 50 | regarding consensus, he moved on to harder Byzantine consensus problems. 51 | Nevertheless, David has looked out for me throughout my years in grad 52 | school, and I've picked up some of his passion for building useful 53 | systems and, more importantly, having fun doing so. 54 | Mendel Rosenblum carries intimate knowledge of low-level details like 55 | the x86 instruction set, yet also manages to keep track of the big 56 | picture. He's helped me with both over the years, surprising me 57 | with how quickly he can solve my technical problems and how clear my 58 | predicaments are when put into his own words. Thanks to Christos Kozyrakis 59 | and Stephen Weitzman for serving on my defense committee, and thanks to 60 | Alan Cox and Scott Rixner for introducing me to research during 61 | my undergraduate studies at Rice. 62 | 63 | Many people contributed directly to this dissertation work. 64 | A special thanks goes to David \mazieres{} and Ezra Hoch for each 65 | finding a bug in earlier versions of Raft. David emailed us one night at 66 | 2:45~a.m.\ as he was reading through the Raft lecture slides for the 67 | user study. He wrote that he found ``one thing quite hard to follow in 68 | the slides,'' which turned out to be a major issue in Raft's safety. 69 | Ezra found a liveness bug in membership changes. He posted to the 70 | Raft mailing list, 71 | ``What if the following happens?''~\cite{Hoch:2014}, and described an 72 | unfortunate series of events that could leave a cluster unable to 73 | elect a leader. Thanks also to Hugues Evrard for finding a small 74 | omission in the formal specification. 75 | 76 | The user study would not have been possible without the support of 77 | Ali Ghodsi, David Mazi\`{e}res, and the students of CS 294-91 at 78 | Berkeley and CS 240 at Stanford. 79 | Scott Klemmer helped us design the user study, 80 | and Nelson Ray advised us on statistical analysis. 81 | The Paxos slides for the user study borrowed heavily from a slide 82 | deck originally created by Lorenzo Alvisi. 83 | 84 | Many people provided feedback on other content in this dissertation. 85 | In addition to my reading committee, Jennifer Wolochow provided helpful comments 86 | on the entire dissertation. 87 | Blake Mizerany, Xiang Li, and Yicheng Qin at CoreOS pushed 88 | me to simplify the membership change algorithm towards 89 | single-server changes. 90 | Anirban Rahut from Splunk pointed out that membership changes may be 91 | needlessly slow when a server joins with an empty log. 92 | Laura Ongaro offered helpful feedback on the 93 | user study chapter. Asaf Cidon helped direct me in finding the 94 | probability of split votes during elections.
95 | Eddie Kohler helped clarify the trade-offs in Raft's commitment rule, 96 | and Maciej Smole\'{n}ski pointed out that because of it, if a leader 97 | were to restart an unbounded number of times before it could mark entries 98 | committed, its log could grow without bound (see Chapter~\ref{related}). 99 | Alexander Shraer helped clarify how membership changes work in Zab. 100 | 101 | Many people provided helpful feedback on the \name{} paper and user study 102 | materials, including 103 | Ed Bugnion, 104 | Michael Chan, 105 | Hugues Evrard, 106 | Daniel Giffin, 107 | Arjun Gopalan, 108 | Jon Howell, 109 | Vimalkumar Jeyakumar, 110 | Ankita Kejriwal, 111 | Aleksandar Kracun, 112 | Amit Levy, 113 | Joel Martin, 114 | Satoshi Matsushita, 115 | Oleg Pesok, 116 | David Ramos, 117 | Robbert van Renesse, 118 | Mendel Rosenblum, 119 | Nicolas Schiper, 120 | Deian Stefan, 121 | Andrew Stone, 122 | Ryan Stutsman, 123 | David Terei, 124 | Stephen Yang, 125 | Matei Zaharia, 126 | 24 anonymous conference reviewers (with duplicates), 127 | and especially Eddie Kohler for shepherding the Raft paper. 128 | 129 | Werner Vogels tweeted a link to an early draft of the \name{} paper, 130 | which gave \name{} significant exposure. Ben Johnson and Patrick Van 131 | Stee both gave early talks on \name{} at major industry conferences. 132 | 133 | This work was supported by the Gigascale Systems Research Center and the 134 | Multiscale Systems Center, two of six research centers funded under the Focus 135 | Center Research Program, a Semiconductor Research Corporation program, 136 | by STARnet, a Semiconductor Research Corporation program sponsored by MARCO 137 | and DARPA, by the National Science Foundation under Grant No.~0963859, 138 | and by grants from Facebook, Google, Mellanox, NEC, NetApp, SAP, and Samsung. 139 | Diego Ongaro was supported by The Junglee Corporation Stanford Graduate 140 | Fellowship. James Myers at Intel donated several SSDs used in 141 | benchmarking. 142 | -------------------------------------------------------------------------------- /basicraft/cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/basicraft/cheatsheet.pdf -------------------------------------------------------------------------------- /basicraft/conclusion.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusion} 2 | \label{basicraft:conclusion} 3 | 4 | This chapter addressed all the core problems for a consensus-based 5 | system. Raft goes beyond reaching consensus on a single value, as in 6 | single-decree Paxos; it achieves consensus on a growing log of commands, 7 | which is needed to build a replicated state machine. It also 8 | includes disseminating information once agreement has been reached, so 9 | that other servers learn the log entries that have been committed. Raft 10 | achieves consensus in a practical and efficient way by electing a 11 | cluster leader to unilaterally make decisions and transmitting only the 12 | necessary log entries when a new leader comes to power. We have 13 | implemented the ideas of Raft in LogCabin, a replicated state machine 14 | (described in Chapter~\ref{performance}). 15 | 16 | Raft uses only a small amount of mechanism to address the full consensus 17 | problem. For example, it uses only two RPCs (RequestVote and 18 | AppendEntries).
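For a sense of scale, both RPCs can be written down in a few dozen lines. The sketch below is in Go and is purely illustrative (LogCabin itself is C++, and these names and types are not its API); the fields are the ones listed in the algorithm's condensed summary (Figure 3.1).

```go
// Sketch of Raft's two RPCs. Field names follow the condensed summary
// (Figure 3.1); the Go types and struct layout are illustrative only.
package raftrpc

// LogEntry is a single replicated log entry.
type LogEntry struct {
	Term    uint64 // term in which the entry was received by the leader
	Command []byte // opaque state machine command
}

// RequestVoteArgs is sent by candidates to gather votes.
type RequestVoteArgs struct {
	Term         uint64 // candidate's term
	CandidateID  uint64 // candidate requesting the vote
	LastLogIndex uint64 // index of candidate's last log entry
	LastLogTerm  uint64 // term of candidate's last log entry
}

type RequestVoteReply struct {
	Term        uint64 // current term, for the candidate to update itself
	VoteGranted bool   // true means the candidate received this vote
}

// AppendEntriesArgs is sent by the leader to replicate entries; with no
// entries it serves as a heartbeat.
type AppendEntriesArgs struct {
	Term         uint64     // leader's term
	LeaderID     uint64     // so followers can redirect clients
	PrevLogIndex uint64     // index of the entry immediately preceding the new ones
	PrevLogTerm  uint64     // term of that entry (consistency check)
	Entries      []LogEntry // empty for heartbeats
	LeaderCommit uint64     // leader's commit index
}

type AppendEntriesReply struct {
	Term    uint64 // current term, for the leader to update itself
	Success bool   // follower had an entry matching PrevLogIndex and PrevLogTerm
}
```

A candidate wins once a majority of replies arrive with VoteGranted set, and a follower accepts AppendEntries only when its log contains an entry matching PrevLogIndex and PrevLogTerm.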
Perhaps surprisingly, creating a compact 19 | algorithm/implementation was not an explicit goal for Raft. Rather, it 20 | is a result of our design for understandability, where every bit of 21 | mechanism must be fully motivated and explained. We found that redundant 22 | or meandering mechanism is hard to motivate, so it naturally gets purged 23 | in the design process. 24 | 25 | Unless we felt confident that a particular problem would affect a large 26 | fraction of Raft deployments, we did not address it in Raft. As a 27 | result, parts of Raft may appear na\"ive. For example, servers in Raft 28 | detect a split vote by waiting for an election timeout; in principle, 29 | they could often detect and even resolve split votes sooner by counting 30 | the votes granted to any candidate. We chose not to develop this 31 | optimization for Raft, since it adds complexity but probably brings no 32 | practical benefit: split votes are rare in a well-configured deployment. 33 | Other parts of Raft may appear overly conservative. For example, a 34 | leader only directly commits an entry from its current term, even though 35 | in some special cases it could safely commit entries from prior terms. 36 | Applying a more complex commitment rule would harm understandability and 37 | would not have a significant effect on performance; commitment is only 38 | delayed briefly with the current rule. In discussing Raft with others, 39 | we found that many people cannot help but think of such optimizations 40 | and propose them, but when the goal is understandability, premature 41 | optimizations should be left out. 42 | 43 | Inevitably, this chapter might have left out some features or 44 | optimizations that turn out to be useful in practice. As implementers 45 | gain more experience with Raft, they will learn when and why certain 46 | additional features may be useful, and they may need to implement these 47 | for some practical deployments. Throughout the chapter, we sketched a 48 | few optional extensions that we currently think are unnecessary but that 49 | may help guide implementers should the need arise. By focusing on 50 | understandability, we hope to have provided a solid foundation for 51 | implementers to adjust Raft according to their experiences. We expect 52 | such adjustments to be straightforward extensions rather than 53 | fundamental changes. 54 | -------------------------------------------------------------------------------- /basicraft/figproperties.tex: -------------------------------------------------------------------------------- 1 | \begin{figure} 2 | \centering 3 | \fbox { 4 | \parbox{5.1in}{ 5 | \small 6 | \begin{description} 7 | \itemsep 0em 8 | \item[\textbf{Election Safety}] \hfill \\ 9 | At most one leader can be elected in 10 | a given term. 11 | \S\ref{basicraft:leaderelection} 12 | 13 | \item[\textbf{Leader Append-Only}] \hfill \\ 14 | A leader never overwrites or deletes 15 | entries in its log; it only appends new entries. \S\ref{basicraft:logreplication} 16 | 17 | \item[\textbf{Log Matching}] \hfill \\ 18 | If two logs contain an entry with 19 | the same index and term, then the logs are identical in all entries 20 | up through the given index. \S\ref{basicraft:logreplication} 21 | 22 | \item[\textbf{Leader Completeness}] \hfill \\ 23 | If a log entry is committed 24 | in a given term, then that entry will be present in the logs of 25 | the leaders for all higher-numbered terms.
26 | \S\ref{basicraft:safety} 27 | 28 | \item[\textbf{State Machine Safety}] \hfill \\ 29 | If a server has applied a 30 | log entry at a given index to its state machine, no other server 31 | will ever apply a different log entry for the same index. 32 | \S\ref{basicraft:safety:argument} 33 | \vspace{-0.5ex} 34 | \end{description} 35 | } 36 | } 37 | \vcaption[key properties]{ 38 | \name{} guarantees that each of these properties is true at all times. The 39 | section numbers indicate where each property is discussed. 40 | } 41 | \label{fig:basicraft:properties} 42 | \end{figure} 43 | -------------------------------------------------------------------------------- /basicraft/intro.tex: -------------------------------------------------------------------------------- 1 | This chapter presents the Raft algorithm. We designed Raft to be 2 | as understandable as possible; the first section describes our approach 3 | to designing for understandability. The following sections describe the 4 | algorithm itself and include examples of design choices we made for 5 | understandability. 6 | -------------------------------------------------------------------------------- /basicraft/raft.tex: -------------------------------------------------------------------------------- 1 | \chapter{Basic Raft algorithm} 2 | \label{basicraft} 3 | 4 | \input{basicraft/intro} 5 | \input{basicraft/understandability} 6 | \input{basicraft/consensus} 7 | \input{basicraft/conclusion} 8 | -------------------------------------------------------------------------------- /basicraft/understandability.tex: -------------------------------------------------------------------------------- 1 | \section{Designing for understandability} 2 | \label{basicraft:understandability} 3 | 4 | We had several goals in designing \name{}: it must provide a complete 5 | and practical foundation for system building, so that it 6 | significantly reduces the amount of design work required of developers; 7 | it must be safe under all conditions and available under 8 | typical operating conditions; and it must be efficient for 9 | common operations. But our most important goal---and most difficult 10 | challenge---was \emph{understandability}. It must be possible 11 | for a large audience to understand the algorithm comfortably. 12 | In addition, it must be possible to develop intuitions 13 | about the algorithm, so that system builders can make the 14 | extensions that are inevitable in real-world implementations. 15 | 16 | There were numerous points in the design of \name{} where we had to 17 | choose among alternative approaches. In these situations we evaluated 18 | the alternatives based on understandability: how hard is it to explain 19 | each alternative (for example, how complex is its state space, and 20 | does it have subtle implications?), and how easy will it be for a reader 21 | to completely understand the approach and its implications? 22 | 23 | We recognize that there is a high degree of subjectivity in such 24 | analysis; nonetheless, we used two techniques that 25 | are generally applicable. The first technique is the well-known approach 26 | of problem decomposition: wherever possible, we divided problems 27 | into separate pieces that could be solved, explained, and understood 28 | relatively independently. For example, in \name{} we separated 29 | leader election, log replication, and safety. 
30 | 31 | Our second approach was to simplify the state space by reducing the 32 | number of states to consider, making the system 33 | more coherent and eliminating nondeterminism where possible. 34 | Specifically, logs are not allowed to have holes, and Raft limits the 35 | ways in which logs can become inconsistent with each other. 36 | Although in most cases we tried to eliminate nondeterminism, there are 37 | some situations where nondeterminism actually improves understandability. 38 | In particular, randomized approaches introduce nondeterminism, but 39 | they tend to reduce the state space by handling all possible choices 40 | in a similar fashion (``choose any; it doesn't matter''). We used 41 | randomization to simplify the Raft leader election algorithm. 42 | -------------------------------------------------------------------------------- /book.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/book.pdf -------------------------------------------------------------------------------- /cc-by.svg: -------------------------------------------------------------------------------- [SVG markup stripped during extraction; nothing recoverable beyond the image/svg+xml MIME type.] -------------------------------------------------------------------------------- /clients/cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/clients/cheatsheet.pdf -------------------------------------------------------------------------------- /compaction/Makefrag: -------------------------------------------------------------------------------- 1 | FIGGENPDF := $(FIGGENPDF) \ 2 | compaction/cleaningdirect.pdf \ 3 | compaction/cleaningtwologs.pdf 4 | 5 | 6 | compaction/cleaningdirect.pdf: compaction/cleaning.svg 7 | inkscape -T -z -i direct -A $@ $< 8 | compaction/cleaningtwologs.pdf: compaction/cleaning.svg 9 | inkscape -T -z -i twologs -A $@ $< 10 | 11 | -------------------------------------------------------------------------------- /compaction/cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/compaction/cheatsheet.pdf -------------------------------------------------------------------------------- /compaction/compaction.tex: -------------------------------------------------------------------------------- 1 | \chapter{Log compaction} 2 | \label{compaction} 3 | 4 | \input{compaction/intro} 5 | \input{compaction/memsnapshot} 6 | \input{compaction/disksnapshot} 7 | \input{compaction/incremental} 8 | \input{compaction/leader} 9 | \input{compaction/conclusion} 10 | -------------------------------------------------------------------------------- /compaction/conclusion.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusion} 2 | \label{compaction:conclusion} 3 | 4 | This chapter discussed several approaches to log compaction in Raft, 5 | which are summarized in Figure~\ref{fig:compaction:rules}.
Different 6 | approaches are suitable for different systems, depending on the size of 7 | the state machine, the level of performance required, and the amount of 8 | complexity budgeted. 9 | Raft supports a wide variety of approaches that share a common 10 | conceptual framework: 11 | % 12 | \begin{itemize} 13 | % 14 | \item Each server compacts the committed prefix of its log 15 | independently. 16 | % 17 | \item The basic interaction between the state machine and Raft involves 18 | transferring responsibility for a prefix of the log from Raft to the 19 | state machine. Once the state machine has applied commands to disk, it 20 | instructs Raft to discard the corresponding prefix of the log. Raft 21 | retains the index and term of the last entry it discarded, along with 22 | the latest configuration as of that index. 23 | % 24 | \item Once Raft has discarded a prefix of the log, the state machine 25 | takes on two new responsibilities: loading the state on a restart and 26 | providing a consistent image to transfer to a slow follower. 27 | % 28 | \end{itemize} 29 | 30 | Snapshotting for memory-based state machines is used successfully in 31 | several production systems, including Chubby and ZooKeeper, 32 | and we have implemented this approach in LogCabin. Although 33 | operating on an in-memory data structure is fast for most operations, 34 | performance during the snapshotting process may be significantly 35 | impacted. Snapshotting 36 | concurrently helps to hide the resource usage, and in the future, 37 | scheduling servers across the cluster to snapshot at different times 38 | might keep snapshotting from affecting clients at all. 39 | 40 | Disk-based state machines that mutate their state in place are 41 | conceptually simple. They still require copy-on-write for transferring a 42 | consistent disk image to other servers, but this may be a small burden 43 | with disks, which naturally split into blocks. However, random disk writes 44 | during normal operation tend to be slow, so this approach will limit the 45 | system's write throughput. 46 | 47 | Ultimately, incremental approaches can be the most efficient form of 48 | compaction. By 49 | operating on small pieces of the state at a time, they can limit bursts 50 | in resource usage (and they can also compact concurrently). They can 51 | also avoid writing the same data out to disk repeatedly; stable 52 | data should make its way to a region of disk that does not get 53 | compacted often. While implementing incremental compaction can be complex, this 54 | complexity can be offloaded to a library such as LevelDB. Moreover, by 55 | keeping data structures in memory and caching more of the disk in 56 | memory, the performance for client operations with incremental 57 | compaction can approach that of memory-based state machines. 58 | -------------------------------------------------------------------------------- /compaction/disksnapshot.tex: -------------------------------------------------------------------------------- 1 | \section{Snapshotting disk-based state machines} 2 | \label{compaction:disksnapshot} 3 | 4 | This section discusses a snapshotting approach for large state machines 5 | (on the order of tens or hundreds of gigabytes) that use disk as their 6 | primary location of record. 7 | These state machines behave differently in that they always have a copy 8 | of the state ready on disk in case of a crash. 9 | Applying each entry from the Raft log mutates 10 | the on-disk state and effectively arrives at a new snapshot. 
Thus, once 11 | an entry is applied, it can be discarded from the Raft log. 12 | (State machines can also buffer writes in memory in hopes of achieving 13 | better disk efficiency; once they are written to disk, the corresponding 14 | entries can be discarded from the Raft log.) 15 | 16 | The main problem with disk-based state machines is that 17 | mutating state on disk can lead to poor performance. Without write 18 | buffering, it requires one or more random disk writes for every command 19 | applied, which can limit the system's overall write throughput (and 20 | write buffering might not help much). 21 | Section~\ref{compaction:incremental} discusses incremental approaches to 22 | log compaction which write to disk more efficiently with large, 23 | sequential writes. 24 | 25 | Disk-based state machines must be able to provide a consistent 26 | snapshot of the disk for the purpose of transmitting it to slow 27 | followers. Although they always have a snapshot on disk, they are 28 | continuously modifying it. Thus, they still require copy-on-write 29 | techniques to retain a consistent snapshot for a long enough period to 30 | transmit it. Fortunately, disk formats are almost always divided into 31 | logical blocks, so implementing copy-on-write in the state machine 32 | should be straightforward. Disk-based state machines can also rely on 33 | operating system support for their snapshots. For example, LVM (logical 34 | volume management) on Linux can be used to create snapshots of entire 35 | disk partitions~\cite{lvm}, and some recent file systems allow 36 | snapshotting individual directories~\cite{btrfssnapshots}. 37 | 38 | Copying a snapshot of a disk image can take a long time, and as 39 | modifications to the disk accumulate, so does the extra disk usage 40 | required to retain the snapshot. 41 | Although we haven't implemented disk-based snapshotting, 42 | we speculate that disk-based state 43 | machines could avoid most of this overhead by transmitting their disk 44 | contents with the following algorithm: 45 | % 46 | \begin{enumerate} 47 | % 48 | \item For each disk block, track the time it was last modified. 49 | % 50 | \item While continuing normal operation, transmit the entire disk 51 | contents to a follower block by block. During this process, no extra 52 | disk space is used on the leader. Since blocks are being modified 53 | concurrently, this is likely to result in an inconsistent disk image on 54 | the follower. As each block is transferred from the leader, note its 55 | last modification time. 56 | % 57 | \item Take a copy-on-write snapshot of the disk contents. Once this is 58 | taken, the leader has a consistent copy of its disk contents, but 59 | additional disk space is used as modifications to the disk occur due to 60 | continued client operations. 61 | % 62 | \item Retransmit only the disk blocks that were modified between when 63 | they were first transmitted in Step~2 and when the snapshot was taken in 64 | Step~3. 65 | % 66 | \end{enumerate} 67 | % 68 | Hopefully, most of the blocks of the consistent snapshot will have 69 | already been transmitted by the time it is created in Step~3. If that is 70 | the case, the transfer in Step~4 will proceed quickly: the additional 71 | disk capacity used to retain the snapshot on the leader during Step~4 72 | will be low, and the additional network bandwidth used during Step~4 to 73 | retransmit modified blocks will also be low. 
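The four steps above map onto a small amount of code. The following Go sketch is speculative and assumes hypothetical Disk and Follower interfaces, standing in for an LVM or filesystem snapshot facility and Raft's RPC layer.

```go
// A sketch of the two-pass disk-image transfer described above (Steps 1-4).
// The Disk and Follower interfaces are hypothetical.
package disktransfer

type blockID int

// Disk abstracts the state machine's on-disk image.
type Disk interface {
	Blocks() []blockID
	Read(b blockID) []byte
	LastModified(b blockID) uint64 // Step 1: per-block modification counter
	Snapshot() Disk                // Step 3: copy-on-write snapshot
}

// Follower receives blocks over the network.
type Follower interface {
	WriteBlock(b blockID, data []byte)
}

func TransferDiskImage(live Disk, f Follower) {
	// Step 2: stream every block while normal operation continues. The
	// follower's image may be inconsistent, since blocks keep changing
	// underneath us, so remember how current each block was when sent.
	sentAt := make(map[blockID]uint64)
	for _, b := range live.Blocks() {
		sentAt[b] = live.LastModified(b)
		f.WriteBlock(b, live.Read(b))
	}

	// Step 3: freeze a consistent view using copy-on-write.
	snap := live.Snapshot()

	// Step 4: retransmit only the blocks that changed after they were first
	// sent. If few blocks changed, this pass is short, so the copy-on-write
	// snapshot (and its extra disk usage) is retained only briefly.
	for _, b := range snap.Blocks() {
		if snap.LastModified(b) > sentAt[b] {
			f.WriteBlock(b, snap.Read(b))
		}
	}
}
```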
74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /compaction/incremental.tex: -------------------------------------------------------------------------------- 1 | \section{Incremental cleaning approaches} 2 | \label{compaction:incremental} 3 | 4 | Incremental approaches to compaction, such as log 5 | cleaning~\cite{Rosenblum:1992,Rumble:2014} and log-structured merge 6 | trees~\cite{ONeil:1996,Chang:2006} (LSM trees), are also possible. 7 | Although they are more complex than snapshotting, incremental 8 | approaches have several desirable features: 9 | % 10 | \begin{itemize} 11 | % 12 | \item They operate on only a fraction of the data at once, so they 13 | spread the load of compaction evenly over time. 14 | % 15 | \item They write to disk efficiently, both in normal operation and 16 | while compacting. They use large, sequential writes in both cases. 17 | Incremental approaches also selectively compact parts of the disk 18 | with the most reclaimable space, so they write less data to disk than 19 | snapshotting for memory-based state machines (which rewrites all of disk 20 | on every snapshot). 21 | % 22 | \item They can transfer consistent state snapshots fairly easily 23 | because they do not modify regions of disk in place. 24 | % 25 | \end{itemize} 26 | % 27 | Section~\ref{compaction:incremental:cleaning} and 28 | Section~\ref{compaction:incremental:lsmtrees} 29 | first describe the basics of log cleaning and LSM trees in 30 | general. Then, Section~\ref{compaction:incremental:raft} 31 | discusses how they could be applied to Raft. 32 | 33 | \subsection{Basics of log cleaning} 34 | \label{compaction:incremental:cleaning} 35 | 36 | Log cleaning was introduced in the context of log-structured file 37 | systems~\cite{Rosenblum:1992} and has recently been proposed for 38 | in-memory storage systems such as RAMCloud~\cite{Rumble:2014}. 39 | In principle, log cleaning can be used for any type of data structure, 40 | though some would be harder to implement efficiently than others. 41 | 42 | Log cleaning maintains the log as the place of record for the system's 43 | state. The layout is optimized for sequential writing, and it makes read 44 | operations effectively random access. Thus, indexing structures are 45 | needed to locate data items to read. 46 | 47 | In log cleaning, the log is split into consecutive regions called 48 | \emph{segments}. Each pass of the log cleaner compacts the log using a 49 | three-step algorithm: 50 | % 51 | \begin{enumerate} 52 | % 53 | \item It first selects segments to clean that have accumulated a large 54 | fraction of obsolete entries. 55 | % 56 | \item It then copies the \emph{live} entries (those that contribute to 57 | the current system state) from those segments to the head of the log. 58 | % 59 | \item Finally, it frees the storage space for the segments, making 60 | that space available for new segments. 61 | % 62 | \end{enumerate} 63 | % 64 | To minimize the effect on normal operation, this process can be done 65 | concurrently~\cite{Rumble:2014}. 66 | 67 | As a result of copying the live entries forwards to the head of the log, 68 | the entries get to be out of order for replay. The entries can include 69 | additional information (e.g., version numbers) to recreate the correct 70 | ordering when the log is applied. 
71 | 72 | The policy of which segments are selected for cleaning has a big impact 73 | on performance; prior work proposes a cost-benefit policy that factors 74 | in not only the amount of space utilized by live entries but also how 75 | long those entries are likely to remain 76 | live~\cite{Rosenblum:1992,Rumble:2014}. 77 | 78 | Determining whether entries are live is the state machine's 79 | responsibility. For example, in a key-value store, a log entry to set a 80 | key to a particular value is live if the key exists and is currently set 81 | to the given value. Determining whether a log entry that deletes a key 82 | is live is more subtle: it is live as long as any prior entries setting that key 83 | are present in the log. RAMCloud 84 | preserves deletion commands (called tombstones) as 85 | necessary~\cite{Rumble:2014}, but another approach is to periodically write 86 | out a summary of the keys that 87 | \emph{are} present in the current state, then all log entries regarding 88 | keys not listed are not live. Key-value stores are a fairly simple 89 | example; other state machines are possible, but unfortunately, 90 | determining liveness will be different for each. 91 | 92 | \subsection{Basics of log-structured merge trees} 93 | \label{compaction:incremental:lsmtrees} 94 | 95 | 96 | Log-structured merge trees (LSM trees) were first described by 97 | O'Neil~\cite{ONeil:1996} and were later popularized in distributed 98 | systems by BigTable~\cite{Chang:2006}. They are used in systems such as 99 | Apache Cassandra~\cite{Cassandra} and HyperDex~\cite{Escriva:2012} and 100 | are available as libraries such as LevelDB~\cite{leveldb} and its forks 101 | (e.g., RocksDB~\cite{rocksdb} and HyperLevelDB~\cite{hyperleveldb}). 102 | 103 | LSM trees are tree-like data structures that store ordered key-value 104 | pairs. At a high level, they use disk similarly to log cleaning 105 | approaches: they write in large sequential strides and do not modify 106 | data on disk in place. However, instead of maintaining all state in the 107 | log, LSM trees reorganize the state for better random access. 108 | 109 | A typical LSM tree keeps recently written keys in a small log on disk. 110 | When the log reaches a fixed size, it is sorted by key and written to 111 | a file called a \emph{run} in sorted order. 112 | Runs are never modified in place, but a compaction process periodically 113 | merges multiple runs together, producing new runs and discarding the old 114 | ones. The merge is reminiscent of merge sort; when a key is in multiple 115 | input runs, only the latest version is kept, so the produced runs are 116 | more compact. The compaction strategy used in LevelDB is summarized in 117 | Figure~\ref{fig:compaction:rules}; it segregates runs by age for 118 | efficiency (similar to log cleaning). 119 | 120 | During normal operation, the state machine can operate on this data 121 | directly. To read a key, it first checks to see if that key was modified 122 | recently in its log, then checks each run. To avoid checking every run 123 | for a key on every lookup, some systems create a bloom filter for each 124 | run (a compact data structure which can say with certainty in some cases 125 | that a key does not appear in a run, though it may sometimes require 126 | searching a run even when a key is not present). 
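A Go sketch of this read path is shown below. The types are hypothetical, and deletions (tombstones) are omitted, as they are in the chapter's summary figure.

```go
// A sketch of the LSM-tree read path described above: recent writes are
// checked first, then each on-disk run from newest to oldest, consulting a
// per-run Bloom filter before searching the run.
package lsm

// Run is one immutable, sorted file on disk.
type Run interface {
	MightContain(key string) bool // Bloom filter: false means definitely absent
	Get(key string) ([]byte, bool)
}

// Tree holds recent writes in memory plus a list of runs, newest first.
type Tree struct {
	memtable map[string][]byte // recently written keys (backed by a small log)
	runs     []Run
}

func (t *Tree) Get(key string) ([]byte, bool) {
	// Recent writes shadow anything in older runs.
	if v, ok := t.memtable[key]; ok {
		return v, true
	}
	for _, r := range t.runs {
		if !r.MightContain(key) {
			continue // the filter proves the key is not in this run
		}
		// A Bloom-filter false positive still costs a wasted search here.
		if v, ok := r.Get(key); ok {
			return v, true
		}
	}
	return nil, false
}
```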
127 | 128 | \subsection{Log cleaning and log-structured merge trees in Raft} 129 | \label{compaction:incremental:raft} 130 | 131 | We have not attempted to implement log cleaning or LSM trees in Raft, 132 | but we speculate that both would work well. Applying LSM trees to Raft 133 | appears to be fairly straightforward. Because the Raft log already 134 | stores recent entries durably on disk, the LSM tree can keep recent data 135 | in a more convenient tree format in memory. This would be fast for servicing 136 | lookups, and when the Raft log reached a fixed size, the tree would already 137 | be in sorted order to write to disk as a new run. Transferring the state 138 | from the leader to a slow follower requires sending all the runs to the 139 | follower (but not the in-memory tree); fortunately, runs are immutable, 140 | so there is no concern of the runs being modified during the transfer. 141 | 142 | \begin{figure} 143 | \centering 144 | 145 | \begin{subfigure}{\textwidth} 146 | \centering 147 | \includegraphics[scale=.45]{compaction/cleaningdirect} 148 | \caption{ 149 | Cleaning the Raft log directly would lead to many holes, which would 150 | add significant complexity to Raft and its interaction with the state 151 | machine. 152 | } 153 | \label{fig:compaction:incremental:cleaningdirect} 154 | \end{subfigure} 155 | 156 | \vspace{2ex} 157 | 158 | \begin{subfigure}{\textwidth} 159 | \centering 160 | \includegraphics[scale=.45]{compaction/cleaningtwologs} 161 | \caption{ 162 | The state machine could instead structure its own data as a log and 163 | clean that log independently, without involving Raft. 164 | } 165 | \label{fig:compaction:incremental:cleaningtwologs} 166 | \end{subfigure} 167 | 168 | \vcaption[approaches to log cleaning in Raft]{ 169 | Two possible approaches to log cleaning in Raft. 170 | } 171 | \end{figure} 172 | 173 | Applying log cleaning to Raft is less obvious. We first considered an 174 | approach in which the Raft log was divided into segments and cleaned 175 | (see Figure~\ref{fig:compaction:incremental:cleaningdirect}). 176 | Unfortunately, cleaning would place a lot of holes in the log where 177 | segments were freed, which would require a modified approach to log 178 | replication. We think this approach could be made to work, but it adds 179 | significant complexity to Raft and its interaction with the state 180 | machine. Moreover, since only the leader can append to the Raft log, 181 | cleaning would need to be leader-based, which would waste the leader's 182 | network bandwidth (this is discussed further in 183 | Section~\ref{compaction:leader}). 184 | 185 | 186 | A better approach would be to handle log cleaning similarly to LSM 187 | trees: Raft would keep a contiguous log for recent changes, and the 188 | state machine would keep its own state as a log, but these logs would be 189 | logically distinct (see 190 | Figure~\ref{fig:compaction:incremental:cleaningtwologs}). When the Raft 191 | log grew to a fixed size, its new entries would be written as a new 192 | segment in the state machine's log, and the corresponding prefix of the 193 | Raft log would be discarded. Segments in the state machine would be 194 | cleaned independently on each server, and the Raft log would remain 195 | entirely unaffected by this. 
We prefer this approach over cleaning the 196 | Raft log directly, since the complexity of log cleaning is encapsulated 197 | entirely in the state machine (the interface between the state machine 198 | and Raft remains simple), and servers can clean independently. 199 | 200 | As described, this approach would require the state machine to write all of 201 | Raft's log entries into its own log (though it could do so in large 202 | batches). This additional copy could be optimized away by directly 203 | moving a file consisting of log entries from Raft's log and 204 | incorporating that file into the state machine's data structures. 205 | This could be a helpful optimization for performance-critical systems, but 206 | unfortunately, it would more tightly couple the state machine and the 207 | Raft module, since the state machine would need to understand the 208 | on-disk representation of the Raft log. 209 | -------------------------------------------------------------------------------- /compaction/intro.tex: -------------------------------------------------------------------------------- 1 | 2 | Raft's log grows during normal operation as it incorporates more client 3 | requests. As it grows larger, it occupies more space and takes more time 4 | to replay. Without some way to compact the log, this will eventually 5 | cause availability problems: servers will either run out of space, or 6 | they will take too long to start. Thus, some form of log compaction is 7 | necessary for any practical system. 8 | 9 | The general idea of log compaction is that much of the information in 10 | the log becomes obsolete over time and can be discarded. For example, an 11 | operation that sets $x$ to 2 is obsolete if a later operation sets $x$ 12 | to 3. Once log entries have been committed and applied to the 13 | state machine, the intermediate states and operations used to arrive at 14 | the current state are no longer needed, and they can be compacted away. 15 | 16 | Unlike the core Raft algorithm and membership changes, different systems 17 | will have different needs when it comes to log compaction. There is no 18 | one-size-fits-all solution to log compaction for a couple of reasons. 19 | First, different systems may choose to trade off simplicity and 20 | performance to varying degrees. Second, the state machine must be 21 | intimately involved in log compaction, and state machines differ 22 | substantially in size and in whether they are based on disk or volatile memory. 23 | 24 | \begin{figure*} 25 | \centering 26 | \includegraphics[scale=0.95]{compaction/rules} 27 | \vcaption[summary of approaches]{ 28 | The figure shows how various approaches to log compaction can be used in 29 | Raft. Details for log-structured merge trees in the figure are based 30 | on LevelDB~\cite{leveldb:compactions}, and details for log cleaning are 31 | based on RAMCloud~\cite{Rumble:2014}; rules for managing deletions are 32 | omitted. 33 | } 34 | \label{fig:compaction:rules} 35 | \end{figure*} 36 | 37 | The goal of this chapter is to discuss a variety of approaches to log 38 | compaction. In each approach, most of the responsibility of log 39 | compaction falls on the state machine, which is in charge of writing the 40 | state to disk and compacting the state. 
State machines can achieve this 41 | in different ways, which are described throughout the chapter and 42 | summarized in Figure~\ref{fig:compaction:rules}: 43 | % 44 | \begin{itemize} 45 | % 46 | \item 47 | % 48 | Snapshotting for memory-based state machines is conceptually the 49 | simplest approach. In snapshotting, the entire current system state is 50 | written to a \emph{snapshot} on stable storage, then the entire log up 51 | to that point is discarded. Snapshotting is used in 52 | Chubby~\cite{Burrows:2006, Chandra:2007} and ZooKeeper~\cite{Hunt:2010}, 53 | and we have implemented snapshotting in LogCabin. Snapshotting is the 54 | approach presented in the most depth in this chapter, 55 | in Section~\ref{compaction:memsnapshot}. 56 | % 57 | \item 58 | % 59 | With disk-based state machines, a recent copy of the system state is 60 | maintained on disk as part of normal operation. Thus, the Raft log can 61 | be discarded as soon as the state machine reflects writes to disk, 62 | and snapshotting is used only when sending consistent disk images to 63 | other servers 64 | (Section~\ref{compaction:disksnapshot}). 65 | % 66 | \item 67 | % 68 | Incremental approaches to log compaction, such as log cleaning and 69 | log-structured merge trees, are presented in 70 | Section~\ref{compaction:incremental}. These approaches write to disk 71 | efficiently, and they utilize resources evenly over time. 72 | % 73 | \item 74 | % 75 | Finally, Section~\ref{compaction:leader} discusses an approach to log 76 | compaction that minimizes the mechanism required by storing snapshots 77 | directly in the log. Though easier to implement, this approach is only 78 | suitable for very small state machines. 79 | % 80 | \end{itemize} 81 | % 82 | LogCabin currently only implements the memory-based snapshotting 83 | approach (it embeds a memory-based state machine). 84 | 85 | The various approaches to compaction share several core concepts. First, 86 | instead of centralizing compaction decisions on the leader, each server 87 | compacts the 88 | committed prefix of its log independently. This avoids having the leader 89 | transmit data to followers that already have the data in their logs. It also 90 | helps modularity: most of the complexity of log compaction is 91 | contained within the state machine and does not interact much with Raft 92 | itself. This helps keep overall system complexity to a 93 | minimum: the complexity of Raft adds to, rather than multiplies with, 94 | the complexity of log compaction. Alternative approaches that centralize compaction 95 | responsibilities on a leader are discussed further in 96 | Section~\ref{compaction:leader} (and for very small state machines, a 97 | leader-based approach may be better). 98 | 99 | Second, the basic interaction between the state machine and Raft 100 | involves transferring responsibility for a prefix of the log from Raft 101 | to the state machine. 102 | Sooner or later after applying entries, the state machine reflects those 103 | entries to disk in a way that can recover the current system state. 104 | Once it has done so, it tells Raft to discard the corresponding 105 | prefix of the log. 106 | Before Raft can give up responsibility for the log prefix, 107 | it must save some of its own state describing the log prefix. 
108 | Specifically, Raft retains the index and term of the last entry it 109 | discarded; this anchors the rest of the log in place after the state 110 | machine's state and allows the AppendEntries consistency check to 111 | continue to work (it needs the index and term for the entry preceding 112 | the first entry in the log). Raft also retains the latest configuration 113 | from the discarded log prefix in order to support cluster membership 114 | changes. 115 | 116 | Third, once Raft has discarded a prefix of the log, the state machine 117 | takes on two new responsibilities. If the server restarts, the state 118 | machine will need to load the state corresponding to the discarded log 119 | entries from disk before it can apply any entries from the Raft log. 120 | In addition, the state machine may need to produce a consistent image of 121 | the state so that it can be sent to a slow follower 122 | (one whose log is far behind the leader's). It is not feasible 123 | to defer compaction until log entries have been ``fully replicated'' to 124 | every member in the cluster, since a minority of slow followers must not 125 | keep the cluster from being fully available, and new servers can be 126 | added to the cluster at any time. Thus, slow followers or new servers 127 | will occasionally need to receive their initial states over the network. 128 | Raft detects this when the next entry needed in AppendEntries has 129 | already been discarded in the leader's log. In this case, the state 130 | machine must provide a consistent image of the state, which the leader 131 | then sends to the follower. 132 | 133 | 134 | -------------------------------------------------------------------------------- /compaction/leader.tex: -------------------------------------------------------------------------------- 1 | \section{Alternative: leader-based approaches} 2 | \label{compaction:leader} 3 | 4 | The log compaction approaches presented in this chapter depart from Raft's 5 | strong leader principle, since servers compact their logs without the 6 | knowledge of the leader. However, we think this departure is justified. 7 | While having a leader helps avoid conflicting decisions in reaching 8 | consensus, consensus has already been reached when snapshotting, so no 9 | decisions conflict. Data still only flows from leaders to followers, 10 | but followers can now reorganize their data independently. 11 | 12 | We also considered leader-based approaches to log compaction, but any 13 | benefits are usually outweighed by performance considerations. 14 | It would be wasteful for the leader to compact its log, then send the result 15 | to the followers, when they could just as well compact their own logs 16 | independently. 17 | Sending the redundant state to each follower would waste network 18 | bandwidth and slow the compaction process. Each follower already has the 19 | information needed to compact its own state, and the leader's outbound 20 | network bandwidth is usually Raft's most precious (bottleneck) resource. 21 | For memory-based snapshots, it is typically much cheaper for a server to 22 | produce a snapshot from its local state than it is to send and receive 23 | one over the network. For incremental compaction approaches, this depends 24 | a bit more on the hardware configuration, but we also expect independent 25 | compaction to be cheaper. 
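To make this division of labor concrete, the interface shared by the approaches in this chapter can be summarized in a few methods. The Go sketch below uses hypothetical names (it is not LogCabin's API); the relevant point for the discussion above is that DiscardPrefix is called by each server on its own committed prefix, with no involvement from the leader.

```go
// A sketch of the compaction interface between Raft and the state machine as
// described in this chapter. All names are hypothetical.
package compaction

import "io"

// Configuration is the cluster membership as of some log index.
type Configuration struct {
	ServerIDs []uint64
}

// RaftLog is the part of Raft that the state machine drives during compaction.
type RaftLog interface {
	// DiscardPrefix releases log entries up through lastIncludedIndex. Raft
	// keeps the index and term of the last discarded entry (to anchor the
	// AppendEntries consistency check) and the latest configuration in the
	// discarded prefix (to support membership changes).
	DiscardPrefix(lastIncludedIndex, lastIncludedTerm uint64, latest Configuration)
}

// StateMachine lists what Raft needs once a prefix has been discarded.
type StateMachine interface {
	// Restore loads the state covering the discarded prefix at startup,
	// before Raft replays any remaining log entries.
	Restore() (lastAppliedIndex uint64, err error)
	// SnapshotForFollower returns a consistent image that the leader streams
	// to a follower whose next needed entry has already been discarded.
	SnapshotForFollower() (io.Reader, error)
}
```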
26 | 27 | \subsection{Storing snapshots in the log} 28 | 29 | \begin{figure} 30 | \centering 31 | \includegraphics[scale=0.5]{compaction/logbased} 32 | \vcaption[alternative: snapshot stored in log]{ 33 | A leader-based approach that stores the snapshot in chunks in the log, 34 | interleaved with client requests. The snapshotting process is started at 35 | the \emph{start} entry, and it completes by the \emph{end} entry. 36 | The snapshot is stored in several log entries between \emph{start} and 37 | \emph{end}. 38 | % 39 | So that client requests can proceed in parallel with snapshotting, each 40 | entry is limited in size, and the rate at which the entries are appended 41 | to the log is limited: the next snapshot chunk is only appended to the 42 | log when the leader learns that the previous snapshot chunk has been 43 | committed. 44 | % 45 | Once each server learns that the \emph{end} entry is committed, it can 46 | discard the entries in its log up to the corresponding \emph{start} 47 | entry. Replaying the log requires a two pass algorithm: the last 48 | complete snapshot is applied first, then the client requests after the 49 | snapshot's \emph{start} entry are applied. 50 | } 51 | \label{fig:compaction:logbased} 52 | \end{figure} 53 | 54 | One possible benefit to leader-based approaches is that, if all the 55 | system state could be stored in the log, then new mechanisms to 56 | replicate and persist the state would not be needed. Thus, we considered 57 | a leader-based approach to snapshotting in which the leader would create 58 | a snapshot and store the snapshot as entries in the Raft log, as shown 59 | in Figure~\ref{fig:compaction:logbased}. The leader would then send this 60 | snapshot to each of its followers using the AppendEntries RPC. 61 | To reduce any disruption on normal operation, each snapshot would be 62 | split into many entries and interleaved with normal client commands in 63 | the log. 64 | 65 | This would achieve better economy of mechanism than storing the snapshot 66 | outside the log, since servers would not need separate mechanisms to 67 | transfer snapshots or persist them (they would be replicated and persisted 68 | just like other log entries). However, in addition to wasting network 69 | bandwidth for followers that could just as easily produce their own 70 | snapshots, this has a serious problem. If a leader fails in the middle 71 | of creating a snapshot, it leaves a partial snapshot in the servers' 72 | logs. In principle this could happen repeatedly and exhaust servers' 73 | storage capacity with garbage accumulated from numerous failed 74 | snapshotting attempts. Thus, we don't think this mechanism is viable in 75 | practice. 76 | 77 | 78 | 79 | \subsection{Leader-based approach for very small state machines} 80 | 81 | For very small state machines, storing the snapshot in the log not only 82 | becomes viable but can also be simplified significantly. If the snapshot 83 | is small enough (up to about one megabyte), it can fit comfortably in a 84 | single log entry without interrupting normal operation for too long. 
To 85 | compact the servers' logs in this way, the leader would: 86 | \vspace{2ex} 87 | \begin{compactenum} 88 | \item Stop accepting new client requests; 89 | \item Wait for all entries in its log to be committed and its state 90 | machine to have applied all entries in its log; 91 | \item Take a snapshot (synchronously); 92 | \item Append the snapshot into a single log entry at the end of its log; and 93 | \item Resume accepting new client requests. 94 | \end{compactenum} 95 | Once each server learned that the snapshot entry was committed, it could 96 | discard every entry before the snapshot in its log. This approach would 97 | cause a small availability gap while client requests were stopped and the 98 | snapshot entry was transferred, but its impact would be limited for very 99 | small state machines. 100 | 101 | This simpler approach avoids 102 | the implementation effort of persisting snapshots outside the log, 103 | transferring them using a new RPC, and snapshotting concurrently. 104 | However, successful systems tend to be used more than their original 105 | designers intended, and this approach would not work well for larger 106 | state machines. 107 | -------------------------------------------------------------------------------- /compaction/rules.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/compaction/rules.pdf -------------------------------------------------------------------------------- /conclusion/conclusion.tex: -------------------------------------------------------------------------------- 1 | \chapter{Conclusion} 2 | \label{conclusion} 3 | 4 | Our goal with this dissertation was to create a better foundation for 5 | learning consensus and building replicated state machines. When we set 6 | out to learn consensus ourselves, we found the time and effort required 7 | to understand existing algorithms was too high, and we worried that this 8 | burden might be prohibitive for many students and practitioners. We were 9 | also left with significant design work before we could build a complete 10 | and practical system using consensus. Thus, we designed Raft as a more 11 | understandable and practical algorithm to serve as a better foundation for 12 | both learning and systems building. 13 | 14 | 15 | Several aspects of Raft's design contribute to its understandability. At 16 | a high level, the algorithm is decomposed differently from Paxos: it 17 | first elects a leader, then the leader manages the replicated log. This 18 | decomposition allows reasoning about Raft's different subproblems (leader 19 | election, log replication, and safety) relatively independently, and 20 | having a strong leader helps minimize state space complexity, as 21 | conflicts can only arise when leadership changes. Raft's leader election 22 | involves very little mechanism, relying on randomized timeouts to avoid 23 | and resolve contention. A single round of RPCs produces a leader in the 24 | common case, and the voting rules guarantee that the leader already has all 25 | committed entries in its log, allowing 26 | it to proceed directly with log replication. Raft's log replication is 27 | also compact and simple to reason about, since it restricts the way 28 | logs change over time and how they differ from each other. 
29 | % 30 | 31 | Raft is well-suited for practical systems: it is described in enough 32 | detail to implement without further refinement, it solves all the major 33 | problems in a complete system, and it is efficient. Raft adopts a 34 | different architecture that is more applicable for building systems: 35 | consensus is often defined as agreement on a single value, but in Raft 36 | we defined it in terms of a replicated log, as this is needed to build a 37 | replicated state machine. Raft manages the replicated log efficiently by 38 | leveraging its leader; committing a request requires just one round of 39 | RPCs from the leader. Moreover, this dissertation has mapped out the 40 | design space for all the major challenges in building a complete system: 41 | % 42 | \begin{itemize} 43 | % 44 | \item Raft allows changing the cluster membership by adding or removing 45 | a single server at a time. These operations preserve safety simply, 46 | since at least one server overlaps any majority during the change. 47 | More complex changes in membership are implemented as a series of 48 | single-server changes. Raft allows the cluster to continue operating 49 | normally during changes. 50 | % 51 | \item Raft supports several ways to compact the log, including both 52 | snapshotting and incremental approaches. Servers compact the committed 53 | portions of their logs independently; the main idea involves 54 | transferring responsibilities for the start of the log from Raft itself 55 | to the server's state machine. 56 | % 57 | \item Client interaction is essential for the overall system to work 58 | correctly. Raft provides linearizability for its client operations, and 59 | read-only requests can bypass the replicated log for performance while 60 | still providing the same consistency guarantees. 61 | % 62 | \end{itemize} 63 | 64 | This dissertation analyzed and evaluated various aspects of Raft, 65 | including understandability, correctness, and the performance of leader 66 | election and log 67 | replication. The user study showed that, 68 | after students learned Raft or Paxos, 33 of 43 of them were able to 69 | answer questions about Raft better, and 33 of 41 stated they thought 70 | Raft would be easier to implement or explain than Paxos. The proof of 71 | safety helps establish Raft's correctness, and the formal specification 72 | is useful for practitioners, as it eliminates any ambiguities in Raft's 73 | description. The randomized leader election algorithm was shown 74 | to work well in a variety of scenarios, typically electing a leader 75 | in less than one second. Finally, measurements showed that the current version of 76 | LogCabin can sustain about \num{20000} kilobyte-sized writes per second 77 | with a three-server cluster. 78 | 79 | We are encouraged by Raft's fast adoption in industry, which we believe 80 | stems from its understandability and its practicality. One person's 81 | dilemma highlights both the problems that Raft set out to solve and the 82 | benefits that it offers. Nate Hardt 83 | built a Paxos-based system at Scale Computing 84 | and had been struggling over the past year to iron out the issues with 85 | his implementation. He is now close to having an efficient, working 86 | system, but after discovering Raft, he is considering rebuilding the 87 | system with Raft. 
He believes his team would be able to more readily 88 | help with a Raft implementation, since they can understand the algorithm 89 | more easily and learn about all of the aspects of a complete system. 90 | Fortunately, others starting on new consensus projects have an easier 91 | choice. Many have already been inspired to build Raft systems just for 92 | the pleasure of learning, speaking to its understandability; others are 93 | implementing Raft for production use, speaking to its practicality. 94 | 95 | 96 | \section{Lessons learned} 97 | 98 | 99 | I have learned many things during my years in graduate school, from how 100 | to build production-quality systems to how to conduct research. In this 101 | section I briefly describe some of the important lessons that I can pass 102 | on to other researchers and systems-builders. 103 | 104 | \subsection{On complexity} 105 | 106 | John once told me I had a ``high tolerance for complexity.'' At first I 107 | thought that was a compliment, that I could handle things that lesser 108 | humans could not. Then I realized it was a criticism. Though my ideas 109 | and code solved the problems they were meant to address, they introduced 110 | an entirely new set of problems: they would be difficult to explain, 111 | learn, maintain, and extend. 112 | 113 | With Raft, we were intentionally intolerant of complexity and 114 | put that to good use. We set out to address the inherently complex problem 115 | of distributed consensus with the most understandable possible solution. 116 | Although this required managing a large amount of complexity, it worked 117 | towards minimizing that complexity for others. 118 | 119 | Every system has a \emph{complexity budget}: the system offers some 120 | benefits for its users, but if its complexity outweighs these benefits, 121 | then the system is no longer worthwhile. 122 | Distributed consensus is a problem that is fundamentally complex, and a 123 | large chunk of its complexity budget must be spent just to arrive at a 124 | complete and working solution. I think many consensus algorithms and 125 | systems before Raft have exhausted their complexity budgets, and this 126 | might explain why few consensus-based systems are readily available. I 127 | hope Raft has changed this calculation and made these systems worth 128 | building. 129 | 130 | \subsection{On bridging theory and practice} 131 | 132 | We started this work because we wanted to build a system using consensus 133 | and found that it was surprisingly hard to do. This resonated with many 134 | others that had tried consensus and had given up on it in the past, but 135 | its value was lost on many academics. By making things simple and 136 | obvious, Raft appears almost uninteresting to academics. The academic 137 | community has not considered understandability per se to be an important 138 | contribution; they want novelty in some other dimension. 139 | 140 | Academia should be more open to work that bridges the gap between theory 141 | and practice. This type of work may not bring any new 142 | \emph{functionality} in theory, but it does give a larger number of 143 | students and practitioners a new \emph{capability}, or at least 144 | substantially reduces their burden. The question of ``Would I teach, 145 | use, and recommend this work?'' is too often ignored, when, ultimately, 146 | it matters to our field. 147 | 148 | 149 | The task of bridging the gap often needs to come from academic research. 
150 | In industry, deadlines to ship products usually lead practitioners to ad 151 | hoc solutions that are just good enough to meet their needs. They can 152 | point out challenges (as the Chubby authors did with Paxos~\cite{Chandra:2007}), but they cannot 153 | usually invest the time needed to find the best solutions. With Raft, we 154 | weren't content with good enough, and we think that is what makes our 155 | work valuable. We explored all the design choices we could think of; 156 | this took careful study at a depth that is difficult to accommodate in 157 | industry, but it produced a valuable result that many others can benefit 158 | from. 159 | 160 | \subsection{On finding research problems} 161 | 162 | When I started graduate school, I did not know how to find interesting 163 | research problems to work on. This seems silly to me now, as there are 164 | too many problems out there. There are various approaches to finding 165 | them, but I have found this one to be effective: 166 | % 167 | \begin{itemize} 168 | % 169 | \item First, start building something. I do not think it matters much 170 | what this something is, as long as you are motivated to build it. For 171 | example, you might choose to build an application you would like to have 172 | or rewrite an existing project in a new programming language you would 173 | like to learn. 174 | % 175 | \item Second, pick a metric and optimize your system for it. For Raft, 176 | we set out to design the most understandable algorithm. Other projects 177 | optimize for performance, security, correctness, usability, or a number 178 | of other metrics. 179 | % 180 | \end{itemize} 181 | 182 | The key to this approach is to ask, at every step of the way, what is 183 | the absolute best possible way to maximize your metric? This inevitably 184 | leads to either discovering something new to learn, or quite often, 185 | finding that no existing solution is quite good enough---a potential 186 | research project. 187 | 188 | The problem then shifts from not having any problems to work on to 189 | having too many, and the challenge becomes deciding which one(s) to 190 | choose. This can pose a difficult judgment call; I suggest looking for 191 | projects that are conceptually interesting, are exciting to work on, and 192 | have the potential for significant impact. 193 | 194 | \section{Final comments} 195 | 196 | This dissertation aims to bridge the gap between theory and practice in 197 | distributed consensus. Much of the prior academic work on distributed 198 | consensus has been theoretical in nature and difficult to apply to 199 | building practical systems. Meanwhile, many of the real-world systems 200 | based on consensus have been ad hoc in nature, where 201 | practitioners have stopped at solutions that were good enough for their 202 | needs, and their implications and alternatives were not fully explored. 203 | In Raft, we have thoroughly explored the design space for a complete 204 | consensus algorithm with a focus on understandability, and we have also 205 | built a complete consensus-based system in order to ensure that our 206 | ideas are practical. We hope this will serve as a good foundation both 207 | for teaching consensus and for building future systems. 
208 | -------------------------------------------------------------------------------- /intro/intro.tex: -------------------------------------------------------------------------------- 1 | \chapter{Introduction} 2 | \label{introduction} 3 | 4 | 5 | Today's datacenter systems and applications run in highly dynamic 6 | environments. They scale out by leveraging the resources of additional 7 | servers, and they grow and shrink according to demand. Server and 8 | network failures are also commonplace: 9 | about 2--4\% of disk drives fail each year~\cite{Schroeder:2007}, 10 | servers crash about as often~\cite{Dean:2009}, and tens of network links 11 | fail every day in modern datacenters~\cite{Gill:2011}. 12 | 13 | As a result, systems must deal with servers coming and going during 14 | normal operations. They must react to changes and adapt automatically within 15 | seconds; outages that are noticeable to humans are typically not acceptable. 16 | This is a major challenge in today's systems; failure handling, 17 | coordination, service discovery, and configuration management are all 18 | difficult in such dynamic environments. 19 | 20 | Fortunately, distributed consensus can help with these challenges. 21 | Consensus allows a collection of machines to work as a coherent group 22 | that can survive the failures of some of its members. Within a consensus 23 | group, failures are handled in a principled and proven way. Because 24 | consensus groups are highly available and reliable, other system 25 | components can use a consensus group as the foundation for their own 26 | fault tolerance. Thus, consensus plays a key role in building reliable 27 | large-scale software systems. 28 | 29 | When we started this work, the need for consensus was becoming clear, 30 | but many systems still struggled with problems that consensus could 31 | solve. Some large-scale systems were still limited by a single 32 | coordination server as a single point of failure (e.g., 33 | HDFS~\cite{Hadoop2Release,HDFSHA}). Many others included ad hoc 34 | replication algorithms that handled failures unsafely (e.g., MongoDB and 35 | Redis~\cite{Kingsbury:Jepsen}). New systems had few options for readily 36 | available consensus implementations (ZooKeeper~\cite{Hunt:2010} was the 37 | most popular), forcing systems builders to conform to one or build their 38 | own. 39 | 40 | Those choosing to implement consensus themselves usually turned to 41 | Paxos~\cite{Lamport:1998, Lamport:2001}. Paxos had dominated the 42 | discussion of consensus algorithms over the last two decades: most 43 | implementations of consensus were based on Paxos or influenced by it, 44 | and Paxos had become the primary vehicle used to teach students about 45 | consensus. 46 | 47 | Unfortunately, Paxos is quite difficult to understand, in spite of 48 | numerous attempts to make it more approachable. Furthermore, its 49 | architecture requires complex changes to support practical systems, 50 | and building a complete system based on Paxos requires developing several 51 | extensions for which the details have not been published or agreed upon. As a 52 | result, both system builders and students struggle with Paxos. 53 | 54 | The two other well-known consensus algorithms are Viewstamped 55 | Replication~\cite{Oki:1988,Oki:1988t,Liskov:2012} and 56 | Zab~\cite{Junqueira:2011}, the algorithm used in ZooKeeper. 
Although we 57 | believe both of these algorithms are incidentally better in structure 58 | than Paxos for building systems, neither has explicitly made this 59 | argument; they were not designed with simplicity or understandability as 60 | a primary goal. The burden of understanding and implementing these 61 | algorithms is still too high. 62 | 63 | Each of these consensus options was difficult to understand and 64 | difficult to implement. Unfortunately, when the cost of implementing 65 | consensus with proven algorithms was too high, systems builders were 66 | left with a tough decision. They could avoid consensus altogether, 67 | sacrificing the fault tolerance or consistency of their systems, or they 68 | could develop their own ad hoc algorithm, often leading to 69 | unsafe behavior. Moreover, when the cost of explaining and 70 | understanding consensus was too high, not all instructors attempted to 71 | teach it, and not all students succeeded in learning it. Consensus is as 72 | fundamental as two-phase commit; ideally, just as many 73 | students should learn it (even though consensus is fundamentally more 74 | difficult). 75 | 76 | After struggling with Paxos ourselves, we set out to find a 77 | new consensus algorithm that could provide a better foundation for 78 | system building and education. Our approach was unusual in that our 79 | primary goal was \emph{understandability}: could we define a consensus 80 | algorithm for practical systems and describe it in a way that is 81 | significantly easier to learn than Paxos? Furthermore, we wanted the 82 | algorithm to facilitate the development of intuitions that are essential 83 | for system builders. It was important not just for the algorithm to 84 | work, but for it to be obvious why it works. 85 | 86 | This algorithm also had to be complete enough to address all aspects of 87 | building a practical system, and it had to perform well enough for 88 | practical deployments. The core algorithm not only had to specify the 89 | effects of receiving a message but also describe what \emph{should} 90 | happen and when; these are equally important for systems builders. 91 | Similarly, it had to guarantee consistency, and it also had to provide 92 | availability whenever possible. It also had to address the many aspects 93 | of a system that go beyond reaching consensus, such as changing the 94 | members of the consensus group. These are necessary in practice, and 95 | leaving this burden to systems builders would risk ad hoc, suboptimal, 96 | or even incorrect solutions. 97 | 98 | The result of this work is a consensus algorithm called Raft. In 99 | designing Raft we applied specific techniques to improve 100 | understandability, including decomposition (Raft separates leader 101 | election, log replication, and safety) and state space reduction (Raft 102 | reduces the degree of nondeterminism and the ways servers can be 103 | inconsistent with each other). We also addressed all of the issues needed to 104 | build a complete consensus-based system. We considered each design 105 | choice carefully, not just for the benefit of our own implementation but 106 | also for the many others we hope to enable. 107 | 108 | We believe that Raft is superior to Paxos and other consensus 109 | algorithms, both for educational purposes and as a foundation for 110 | implementation.
It is simpler and more understandable than other 111 | algorithms; it is described completely enough to meet the needs of a 112 | practical system; it has several open-source implementations and is used 113 | by several companies; its safety properties have been formally specified 114 | and proven; and its efficiency is comparable to other algorithms. 115 | 116 | The primary contributions of this dissertation are as follows: 117 | % 118 | \begin{itemize} 119 | % 120 | \item The design, implementation, and evaluation of the Raft consensus 121 | algorithm. Raft is similar in many ways to existing consensus algorithms 122 | (most notably, Oki and Liskov's Viewstamped Replication~\cite{Oki:1988, 123 | Liskov:2012}), but it is designed for understandability. This led to 124 | several novel features. For example, Raft uses a stronger form of 125 | leadership than other consensus algorithms. 126 | This simplifies the management of the replicated log and makes Raft 127 | easier to understand. 128 | % 129 | \item The evaluation of Raft's understandability. A user study with 43 130 | students at two universities shows that Raft is significantly easier to 131 | understand than Paxos: after learning both algorithms, 33 of these 132 | students were able to answer questions about Raft better than questions 133 | about Paxos. We believe this is the first scientific study to evaluate 134 | consensus algorithms based on teaching and learning. 135 | % 136 | \item The design, implementation, and evaluation of Raft's leader 137 | election mechanism. While many consensus algorithms do not prescribe a 138 | particular leader election algorithm, Raft includes a specific algorithm 139 | involving randomized timers. This adds only a small amount of mechanism 140 | to the heartbeats already required for any consensus algorithm, while 141 | resolving conflicts simply and rapidly. The evaluation of leader 142 | election investigates its behavior and performance, concluding that this simple 143 | approach is sufficient in a wide variety of practical environments. It 144 | typically elects a leader in under 20 times the cluster's one-way 145 | network latency. 146 | % 147 | \item The design and implementation of Raft's cluster membership change 148 | mechanism. Raft allows adding or removing a single server at a time; 149 | these operations preserve safety simply, since at least one server 150 | overlaps any majority during the change. More complex changes in 151 | membership are implemented as a series of single-server changes. 152 | Raft allows the 153 | cluster to continue operating normally during changes, and membership 154 | changes can be implemented with only a few extensions to the basic 155 | consensus algorithm. 156 | % 157 | \item A thorough discussion and implementation of the other components 158 | necessary for a complete consensus-based system, including client 159 | interaction and log compaction. Although we do not believe these aspects 160 | of Raft to be particularly novel, a complete description is important 161 | for understandability and to enable others to build real systems. We have 162 | implemented a complete consensus-based service to explore and address 163 | all of the design decisions involved. 164 | % 165 | \item A proof of safety and formal specification for the Raft algorithm. 166 | The level of precision in the formal specification aids in reasoning 167 | carefully about the algorithm and clarifying details in the algorithm's 168 | informal description. 
The proof of safety helps build confidence in 169 | Raft's correctness. It also aids others who wish to extend Raft by 170 | clarifying the implications for safety of their extensions. 171 | % 172 | \end{itemize} 173 | 174 | We have implemented many of the designs in this dissertation in 175 | an open-source implementation of Raft called LogCabin~\cite{logcabin}. 176 | LogCabin served as our test platform for new ideas in Raft 177 | and as a way to verify that we understood the issues of building a 178 | complete and practical system. The implementation is described in more 179 | detail in Chapter~\ref{performance}. 180 | 181 | The remainder of this dissertation introduces the replicated state 182 | machine problem and discusses the strengths and weaknesses of Paxos 183 | (Chapter~\ref{motivation}); presents the Raft consensus algorithm, 184 | its extensions for cluster membership changes and log compaction, and 185 | how clients interact with Raft 186 | (Chapters~\ref{basicraft}--\ref{clients}); 187 | evaluates Raft for understandability, correctness, and leader election 188 | and log replication performance 189 | (Chapters~\ref{userstudy}--\ref{performance}); and discusses related 190 | work (Chapter~\ref{related}). 191 | -------------------------------------------------------------------------------- /leaderelection/conclusion.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusion} 2 | \label{leaderelection:conclusion} 3 | 4 | Raft's leader election algorithm performs well in a wide variety of 5 | scenarios. It is able to elect leaders within tens of milliseconds on 6 | average on a real-world LAN. When election timeouts are chosen randomly 7 | from a range of 10--20 times the one-way network latency, leaders are 8 | elected within about 20 times the one-way network latency on average. 9 | Tail election times are also fairly short. For example, 99.9\% of 10 | elections complete in less than \SI{3}{seconds} when the one-way network 11 | latency is as high as \SIrange{30}{40}{\milli\second}. 12 | 13 | This chapter answered most of the basic questions about how Raft's 14 | leader election algorithm performs. Further analysis is required to 15 | answer the following additional questions: 16 | % 17 | \begin{compactitem} 18 | % 19 | \item How much longer does leader election take when servers start with 20 | different initial current term numbers? 21 | % 22 | \item How does leader election perform in asymmetric networks, where 23 | each link has a different latency? 24 | % 25 | \item How well does leader election work on networks with severe packet 26 | loss? 27 | % 28 | \item How well does leader election work when servers experience 29 | severe clock drift? 30 | % 31 | \end{compactitem} 32 | 33 | Another interesting area of research would be to explore setting 34 | election timeouts dynamically. Raft's leader election performance 35 | depends on a properly configured election timeout, and it would be nice 36 | to configure this election timeout automatically and dynamically. 37 | However, we do not know how leader election will perform if different 38 | servers use different election timeout ranges (this is related to the 39 | clock drift question above). 
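
By contrast, the static configuration assumed throughout this chapter needs very little mechanism. The sketch below is hypothetical (not LogCabin code); it draws a fresh timeout uniformly from the conservative 150--300 ms range recommended in this chapter each time the election timer is reset, and derives the heartbeat interval as half the minimum election timeout, as in the benchmark setup.

\begin{verbatim}
// Hypothetical timer configuration, not LogCabin code. Each time the
// election timer is reset (on a valid heartbeat or a granted vote), a
// fresh timeout is drawn uniformly from a conservative range; the
// heartbeat interval is derived as half the minimum election timeout,
// matching the benchmark setup in this chapter.
#include <chrono>
#include <cstdint>
#include <random>

class ElectionTimer {
  public:
    ElectionTimer(std::chrono::milliseconds min, std::chrono::milliseconds max)
        : gen_(std::random_device{}()),
          dist_(min.count(), max.count()) {}

    // Returns the next randomized timeout; re-randomizing on every
    // reset is what keeps repeated split votes unlikely.
    std::chrono::milliseconds nextTimeout() {
        return std::chrono::milliseconds(dist_(gen_));
    }

  private:
    std::mt19937_64 gen_;
    std::uniform_int_distribution<int64_t> dist_;
};

int main() {
    using std::chrono::milliseconds;
    ElectionTimer timer(milliseconds(150), milliseconds(300));
    milliseconds heartbeatInterval = milliseconds(150) / 2;  // 75 ms
    milliseconds next = timer.nextTimeout();                 // in [150, 300] ms
    (void)heartbeatInterval;
    (void)next;
    return 0;
}
\end{verbatim}
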
40 | -------------------------------------------------------------------------------- /leaderelection/leaderelection.tex: -------------------------------------------------------------------------------- 1 | \chapter{Leader election evaluation} 2 | \label{leaderelection} 3 | 4 | \newcommand\algo[1]{\emph{#1} algorithm} 5 | \newcommand\algocap[1]{\emph{#1} algorithm} 6 | \newcommand\algoabbrv[1]{\emph{#1} algo.} 7 | 8 | This chapter analyzes the performance of leader election in Raft, which 9 | occurs when a leader fails and needs to be 10 | replaced. Although we expect leader failures to be a rare event, they 11 | should be handled in a timely manner. We would like Raft to reliably 12 | elect a new leader in a fraction of a second in a typical deployment. 13 | 14 | Unfortunately, it is difficult to put a bound on the time or number of 15 | messages leader election will take. According to the FLP impossibility 16 | result~\cite{Fischer:1985}, no 17 | fault-tolerant consensus protocol can deterministically terminate in a 18 | purely asynchronous model. This manifests itself in split votes in Raft, 19 | which can potentially impede progress repeatedly during leader election. 20 | Raft also makes use of 21 | randomized timeouts during leader election, which makes its analysis 22 | probabilistic. Thus, we can only say that leader election 23 | performs well with high likelihood, and even then only under various 24 | assumptions. For example, servers must choose timeouts from a random 25 | distribution (they are not somehow synchronized), clocks must proceed at 26 | about the same rates, and servers and networks must be timely (or 27 | stopped). If these assumptions are not met for some period of time, the 28 | cluster might not be able to elect a leader during that period 29 | (though safety will always be maintained). 30 | 31 | This chapter draws the following conclusions about the performance of 32 | Raft's leader election algorithm: 33 | % 34 | \begin{itemize} 35 | % 36 | \item When no split vote occurs, elections complete about one third of 37 | the way into the election timeout range, on average. They complete 38 | slightly faster in clusters with more available servers, since the first 39 | server is expected to time out sooner. 40 | (Section~\ref{leaderelection:nosplit}) 41 | % 42 | \item Split vote rates are low when the election timeout range is 43 | sufficiently broad. We recommend a range that is 44 | 10--20 times the one-way 45 | network latency, which keeps split votes rates under 40\% in all cases 46 | for reasonably sized clusters, and typically results in much lower rates. 47 | Clusters will experience more split votes as more servers fail, since 48 | fewer votes are available. (Section~\ref{leaderelection:split:rate}) 49 | % 50 | \item The number of election terms required to elect a leader follows a 51 | geometric distribution, 52 | where the expected number is $\dfrac{1}{1-\text{split vote rate}}$. 53 | Thus, even a high split vote rate of 50\% will only need two election 54 | terms on average to elect a leader. 55 | A cluster configured with an election timeout that is 10--20 times its 56 | one-way network latency will be able to elect a leader in less than 20 57 | times its one-way network latency on average. 58 | (Section~\ref{leaderelection:split:total}) 59 | % 60 | \item Leader election performs well in practice in both local and wide 61 | area networks. 
In a real-world LAN, our system was able to elect a 62 | leader in an average of \SI{35}{\milli\second} when configured with aggressive 63 | timeouts, though we suggest using a more conservative timeout range in 64 | practice. On a simulated WAN spanning the US, elections typically 65 | complete in half a second, and 99.9\% of elections complete in 66 | \SI{3}{seconds}, even when two of five servers have failed. 67 | (Section~\ref{leaderelection:lan}) 68 | % 69 | \item The performance of leader election is not substantially affected 70 | by the log comparison in RequestVote RPCs, when some servers will not 71 | grant their votes to others. (Section~\ref{leaderelection:logsdiff}) 72 | % 73 | \item The basic leader election algorithm can cause disruptions if 74 | followers lose connectivity, increment their terms, and then regain 75 | connectivity. Section~\ref{leaderelection:prevote} extends the basic 76 | algorithm with an additional phase to avoid such disruptions. 77 | % 78 | \end{itemize} 79 | 80 | \input{leaderelection/nosplit} 81 | \input{leaderelection/splitrate} 82 | \input{leaderelection/splittotal} 83 | \input{leaderelection/real} 84 | \input{leaderelection/logsdiff} 85 | \input{leaderelection/prevote} 86 | \input{leaderelection/conclusion} 87 | -------------------------------------------------------------------------------- /leaderelection/logsdiff.tex: -------------------------------------------------------------------------------- 1 | \section{What happens when logs differ?} 2 | \label{leaderelection:logsdiff} 3 | 4 | Most of this chapter has assumed that servers grant their votes 5 | on a purely first-come-first-served basis. In reality, Raft restricts 6 | how servers may grant votes: the RequestVote RPC contains information 7 | about the candidate's log, and a voter does not grant its vote or reset 8 | its election timer if the voter's log is more up-to-date than the 9 | candidate's. 10 | 11 | We used AvailSim to investigate what effect, if any, this voting 12 | restriction has on leader election performance. The simulation was 13 | configured with the same WAN network as in 14 | Section~\ref{leaderelection:lan}, but each server was configured with a 15 | different log. Thus, only three, two, or one of the five servers were eligible to 16 | become leader, depending on whether zero, one, or two of the servers had failed. 17 | 18 | 19 | 20 | Figure~\ref{fig:leaderelection:simulation:dist:submission-failures-logsdiff} 21 | shows the results; performance is very similar to when the servers had 22 | equal logs. The curves do have slightly different shapes (they have 23 | sharper corners), but the effect is small. Thus, we do not believe the 24 | log comparison adversely affects leader election performance. 25 | 26 | 27 | -------------------------------------------------------------------------------- /leaderelection/nosplit.tex: -------------------------------------------------------------------------------- 1 | \section{How fast will Raft elect a leader with no split votes?} 2 | \label{leaderelection:nosplit} 3 | 4 | \begin{figure} 5 | \centering 6 | \includegraphics[scale=.55]{leaderelection/timeline} 7 | \vcaption[leader election timeline with no split votes]{ 8 | Timeline of a typical election when no split vote occurs. 9 | The first candidate to time out successfully collects votes and 10 | completes the election (other elections may not be so fortunate). 
The 11 | figure is drawn to scale assuming the election timeouts are chosen from 12 | a range between 10--20 times the cluster's one-way network latency. 13 | \\ 14 | The ``old leader heartbeats'' row shows the final heartbeat that the 15 | old leader completes, and when it would have sent its next heartbeats 16 | were it not to crash. 17 | \\ 18 | The ``old leader crash'' row shows the interval during which the old 19 | leader crashes. This time is assumed to follow a uniform random 20 | distribution within its heartbeat interval. The vertical line halfway 21 | through the interval is its expected (average) value. 22 | \\ 23 | The ``base election timeout'' row shows the interval during which all 24 | the followers await additional heartbeats from the old leader. 25 | \\ 26 | The ``election timeout range'' row shows the interval during which the 27 | servers would time out and start elections to replace the old leader. 28 | The vertical lines show expected earliest timeout values for different 29 | numbers of remaining servers (eight, four, and two, respectively). 30 | \\ 31 | The ``requests for votes'' row shows when the candidate sends its 32 | RequestVote RPCs to the other servers and receives their votes. 33 | \\ 34 | The ``new leader heartbeats'' row shows the new leader sending out 35 | heartbeat RPCs right away after becoming leader, then periodically 36 | thereafter. 37 | } 38 | \label{fig:leaderelection:nosplit:timeline} 39 | \end{figure} 40 | 41 | The most common case for leader election in Raft is when no split vote 42 | occurs, and this section analyzes how long it takes to elect a leader 43 | under that assumption. This is expected to be the normal case for Raft 44 | clusters; if the cluster is configured correctly, most normal elections 45 | will not encounter a split vote. The first server to time out will be 46 | able to collect votes from a majority of the cluster and become leader. 47 | The timeline of events is shown in 48 | Figure~\ref{fig:leaderelection:nosplit:timeline}. 49 | 50 | \begin{table} 51 | \centering 52 | \begin{tabular}{ccl} 53 | variable & type & meaning \\ 54 | \hline 55 | \noalign{\vskip .75ex} 56 | $s$ & natural & number of available servers \\ 57 | $n$ & natural & size of full cluster (including unavailable servers) \\ 58 | $c$ & natural & number of servers to time out near each other \\ 59 | $l$ & time & constant half round trip network latency (special case of $L$)\\ 60 | $L$ & random variable of time & half round trip network latency \\ 61 | $W$ & random variable of time & time to write term and vote durably to disk \\ 62 | $T_i$ & random variable of time & timeout of server $i$ \\ 63 | $M_s$ & random variable of time & earliest timeout of $s$ servers \\ 64 | $D_{c,s}$ & random variable of time & difference in timeouts of earliest $c$ of $s$ servers \\ 65 | $E_s$ & random variable of time & time to complete an election \\ 66 | \end{tabular} 67 | \vcaption[summary of variables]{ 68 | Summary of the variables used throughout this chapter to 69 | analyze leader election performance. 70 | Times are normalized to the election timeout range (ranging from 0 to 71 | 1). 72 | } 73 | \label{tab:leaderelection:variables} 74 | \end{table} 75 | 76 | \begin{figure} 77 | \centering 78 | \includegraphics[scale=.5]{leaderelection/earliesttimeoutdiagram} 79 | \vcaption[earliest timeout example]{ 80 | What is the smallest random election timeout value chosen by $s$ 81 | servers? 
The diagram shows random election timeouts a five-server 82 | cluster where one server has failed ($s=4$). $t_{(1)}$ is the smallest 83 | timeout value chosen. 84 | } 85 | \label{fig:leaderelection:theory:earliesttimeoutdiagram} 86 | \end{figure} 87 | 88 | With no split votes, the time it takes to elect a leader is determined 89 | by how long it takes the first server to time out. The question of when 90 | it will time out is illustrated in 91 | Figure~\ref{fig:leaderelection:theory:earliesttimeoutdiagram}. Each server 92 | waits for a uniform random timeout after the last time it received a 93 | heartbeat. Intuitively, any individual server is expected to time out 94 | halfway through the election timeout range, but with more servers it 95 | becomes more likely that the first server will time out sooner. 96 | 97 | We now define the problem more precisely and derive when the first 98 | server times out analytically. The variables defined in this chapter are 99 | summarized in Table~\ref{tab:leaderelection:variables}. Suppose each 100 | server chooses its timeouts randomly from the standard 101 | uniform distribution (in the range $[0,1]$). Let $T_1 \ldots T_s$ be 102 | random variables representing when each of $s$ servers times out. Let $M_s$ be the minimum of 103 | $T_1 \ldots T_s$, a random variable representing the time the first 104 | server times out. Its cumulative distribution function (CDF) defines the 105 | probability that $M_s$ is no greater than a particular time, $t$. This 106 | is equivalent to one minus the probability that all servers times out 107 | after $t$: 108 | \begin{align*} 109 | \Pr(M_s \leq t) 110 | &= 1 - \Pr(M_s > t) \\ 111 | &= 1 - \prod_{i=1}^s \Pr(T_i > t) \\ 112 | &= 1 - \prod_{i=1}^s (1 - t) \\ 113 | &= 1 - (1-t)^s 114 | \end{align*} 115 | % 116 | For example, consider a cluster with five servers where the prior 117 | leader has failed. 118 | The probability that the earliest of the remaining four servers times out 119 | sometime in the first quarter of the election timeout range is 120 | $\Pr(M_4 \leq \frac{1}{4}) = 1 - (1 - \frac{1}{4})^4 \approx 0.68$. 121 | The CDF is graphed in 122 | Figure~\ref{fig:leaderelection:theory:model:earliesttimeout} 123 | for various values of $s$. 124 | 125 | \begin{figure} 126 | \centering 127 | \includegraphics{leaderelection/earliesttimeout} 128 | \vspace{-2ex} 129 | \vcaption[earliest timeout CDF]{ 130 | The graph shows the probability that the earliest server times out 131 | before $t$ when different numbers of servers are available. The point on 132 | each line shows the time when the first server is expected to time out 133 | ($\Ex[M_s]$). 134 | } 135 | \label{fig:leaderelection:theory:model:earliesttimeout} 136 | \end{figure} 137 | 138 | The probability density function (PDF) of $M_s$ is the derivative of the 139 | CDF: 140 | \begin{align*} 141 | f_{M_s}(t) &= \frac{d}{dt} \Pr(M_s \leq t) \\ 142 | &= \frac{d}{dt} (1-(1-t)^s) \\ 143 | &= -\frac{d}{dt} (1-t)^s \\ 144 | &= s (1-t)^{s-1} 145 | \end{align*} 146 | 147 | The expected value (mean) of $M_s$ is calculated from the PDF: 148 | \begin{align*} 149 | \Ex[M_s] &= \int_0^1 \! t \, f_{M_s}(t) \, dt \\ 150 | &= \int_0^1 \! t (s (1-t)^{s-1}) \, dt \\ 151 | &= \left. \! -\frac{(1-t)^s (s\,t+1)}{s+1} \right|_{t=0}^1 \\ 152 | &= \frac{1}{s+1} 153 | \end{align*} 154 | 155 | \noindent 156 | For example, with four available servers, the first timeout is expected 157 | to occur $\dfrac{1}{5}^\textrm{th}$ of the way through the election 158 | timeout range. 
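
As a quick numerical sanity check of this closed form, the following throwaway sketch (independent of AvailSim and LogCabin) estimates $\Ex[M_s]$ by simulation:

\begin{verbatim}
// Throwaway numerical check of E[M_s] = 1/(s+1), independent of
// AvailSim and LogCabin: draw s timeouts from the standard uniform
// distribution and average the minimum over many trials.
#include <algorithm>
#include <cstdio>
#include <random>

int main() {
    std::mt19937 gen(42);
    std::uniform_real_distribution<double> timeout(0.0, 1.0);
    const int trials = 200000;
    for (int s = 2; s <= 8; ++s) {
        double sum = 0.0;
        for (int t = 0; t < trials; ++t) {
            double earliest = 1.0;
            for (int i = 0; i < s; ++i)
                earliest = std::min(earliest, timeout(gen));
            sum += earliest;
        }
        std::printf("s = %d: simulated %.4f, 1/(s+1) = %.4f\n",
                    s, sum / trials, 1.0 / (s + 1));
    }
    return 0;
}
\end{verbatim}
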
Fortunately, this very simple expression is a good 159 | estimate of Raft's overall election performance, since elections 160 | complete soon after the first candidate times out when no split vote 161 | occurs. 162 | 163 | More precisely, if there is no split vote, the full election requires 164 | a candidate to time out and request votes, once the leader crashes: 165 | \begin{align*} 166 | E_s &= \text{baseline election timeout} + M_s + \text{time to request 167 | votes} - \text{heartbeat adjustment} \\ 168 | E_s &= 1 + M_s + 2L + W - U(0,\dfrac{1}{2}) \\ 169 | \Ex[E_s] &= 1 + \frac{1}{s+1} + 2\Ex[L] + \Ex[W] - \dfrac{1}{4} 170 | \end{align*} 171 | where election timeouts are chosen from the range $[1,2]$, 172 | $L$ is the network latency, and $W$ is the time to write the votes 173 | persistently to disk. 174 | A uniform random time value from the range $[0, \dfrac{1}{2}]$ is 175 | subtracted, since leaders are expected to crash randomly within their 176 | heartbeat intervals rather than immediately after sending heartbeats. 177 | -------------------------------------------------------------------------------- /leaderelection/prevote.tex: -------------------------------------------------------------------------------- 1 | \section{Preventing disruptions when a server rejoins the cluster} 2 | \label{leaderelection:prevote} 3 | 4 | One downside of Raft's leader election algorithm is that a server that 5 | has been partitioned from the cluster is likely to cause a disruption 6 | when it regains connectivity. When a server is partitioned, it will not 7 | receive heartbeats. It will soon increment its term to start an 8 | election, although it won't be able to collect enough votes to become 9 | leader. When the server regains connectivity sometime later, its larger 10 | term number will propagate to the rest of the cluster (either through 11 | the server's RequestVote requests or through its AppendEntries 12 | response). This will force the cluster leader to step down, and a new 13 | election will have to take place to select a new leader. Fortunately, 14 | such events are likely to be rare, and each will only cause one leader to 15 | step down. 16 | 17 | If desired, Raft's basic leader election algorithm can be extended with 18 | an additional phase to prevent such disruptions, forming the Pre-Vote 19 | algorithm. In the Pre-Vote algorithm, a candidate only increments its 20 | term if it first learns from a majority of the cluster that they would 21 | be willing to grant the candidate their votes (if the candidate's log is 22 | sufficiently up-to-date, and the voters have not received heartbeats from 23 | a valid leader for at least a baseline election timeout). This was 24 | inspired by ZooKeeper's algorithm~\cite{Junqueira:2011}, in which a 25 | server must receive a majority of votes before it calculates a new epoch 26 | and sends NewEpoch messages (however, in ZooKeeper servers do not 27 | solicit votes, other servers offer them). 28 | 29 | The Pre-Vote algorithm solves the issue of a partitioned server 30 | disrupting the cluster when it rejoins. While a server is partitioned, 31 | it won't be able to increment its term, since it can't receive 32 | permission from a majority of the cluster. Then, when it rejoins the 33 | cluster, it still won't be able to increment its term, since the other 34 | servers will have been receiving regular heartbeats from the leader. 
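
In code, the voter's side of the Pre-Vote check described above might look like the following sketch (the message and field names are hypothetical; neither the dissertation nor LogCabin prescribes this exact form):

\begin{verbatim}
#include <cstdint>

// Sketch of the voter's side of the Pre-Vote check. The message and
// field names are hypothetical. Granting a pre-vote does not change the
// voter's term or votedFor and does not reset its election timer.
struct PreVoteRequest {
    uint64_t proposedTerm;  // the term the candidate would start (currentTerm + 1)
    uint64_t lastLogTerm;   // term of the candidate's last log entry
    uint64_t lastLogIndex;  // index of the candidate's last log entry
};

bool wouldGrantVote(const PreVoteRequest& req,
                    uint64_t currentTerm,
                    uint64_t myLastLogTerm,
                    uint64_t myLastLogIndex,
                    bool heardFromLeaderWithinBaselineTimeout) {
    // A voter that has heard from a valid leader within the baseline
    // election timeout assumes the leader is still alive and refuses,
    // so a partitioned server cannot disrupt a healthy cluster.
    if (heardFromLeaderWithinBaselineTimeout)
        return false;
    // The proposed term must actually be newer than the voter's.
    if (req.proposedTerm <= currentTerm)
        return false;
    // Same log comparison as RequestVote: the candidate's log must be
    // at least as up-to-date as the voter's.
    if (req.lastLogTerm != myLastLogTerm)
        return req.lastLogTerm > myLastLogTerm;
    return req.lastLogIndex >= myLastLogIndex;
}
\end{verbatim}
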
35 | Once the server receives a heartbeat from the leader itself, it will 36 | return to the follower state (in the same term). 37 | 38 | We recommend the Pre-Vote extension in deployments that would benefit 39 | from additional robustness. We also tested it in various leader 40 | election scenarios in AvailSim, and it does not appear to significantly 41 | harm election performance. 42 | 43 | -------------------------------------------------------------------------------- /leaderelection/real.tex: -------------------------------------------------------------------------------- 1 | \section{How fast will the complete Raft algorithm elect a leader in 2 | real networks?} 3 | \label{leaderelection:lan} 4 | 5 | 6 | The previous sections were based on simplified models of how leader 7 | election works in Raft. We wanted to know how fast Raft will be able to 8 | elect a leader in the real world. To find out, this section evaluates 9 | Raft's leader election algorithm using a real-world benchmark in a LAN 10 | environment and a realistic simulator in a slower WAN environment. 11 | 12 | \subsubsection{Real-world implementation on a LAN} 13 | 14 | \begin{table} 15 | \centering 16 | \begin{tabular}{r l} 17 | code & LogCabin~\cite{logcabin}, written in C++11 \\ 18 | OS & x86-64 RHEL6 (Linux 2.6.32) \\ 19 | CPU & Xeon X3470 (4 cores, 8 hyperthreads) \\ 20 | disk & ext4 file system on Crucial M4 SSDs (1 SSD per server) \\ 21 | network & Protocol Buffers~\cite{Varda:2008} over TCP/IP over 1~gigabit Ethernet \\ 22 | configuration & in-memory state machine, no log compaction \\ 23 | \end{tabular} 24 | \vcaption[experimental setup for benchmark]{ 25 | Experimental setup for real-world LAN benchmark. 26 | } 27 | \label{tab:leaderelection:benchmarksetup} 28 | \end{table} 29 | 30 | We used LogCabin to measure the performance of Raft's 31 | leader election algorithm on five servers connected by a gigabit Ethernet network. 32 | The experimental setup is summarized in 33 | Table~\ref{tab:leaderelection:benchmarksetup}. The benchmark repeatedly 34 | crashed the leader of a cluster of five servers and timed how long it 35 | took to detect the crash and elect a new leader. The benchmark measured 36 | the time from when the old leader crashed until the other servers 37 | received the new leader's first heartbeat (see 38 | Figure~\ref{fig:leaderelection:nosplit:timeline}). The leader was crashed 39 | randomly within its heartbeat interval, which was half of the 40 | minimum election timeout for all tests. Thus, the smallest possible 41 | downtime was about half of the minimum election timeout. 42 | 43 | \begin{figure} 44 | \centering 45 | 46 | \begin{subfigure}{\textwidth} 47 | \includegraphics{leaderelection/benchmarks-randomness} 48 | \caption{ 49 | Time to elect new leader when varying the range of randomness in 50 | election timeouts. 51 | } 52 | \label{fig:leaderelection:benchmark-randomness} 53 | \end{subfigure} 54 | 55 | \vspace{4ex} 56 | 57 | \begin{subfigure}{\textwidth} 58 | \includegraphics{leaderelection/benchmarks-scale} 59 | \caption{ 60 | Time to elect new leader when scaling the minimum election timeout. 61 | } 62 | \label{fig:leaderelection:benchmark-scale} 63 | \end{subfigure} 64 | 65 | \vcaption[benchmark results on LAN cluster]{ 66 | The graphs show the time to detect and replace a crashed leader in the 67 | real-world LAN benchmark. 
Each line represents \num{1000} 68 | trials (except for 100 trials for ``\SIrange{150}{150}{\milli\second}'') 69 | and corresponds to a 70 | particular choice of election timeouts; for example, 71 | ``\SIrange{150}{155}{\milli\second}'' 72 | means that election timeouts were chosen randomly and uniformly between 73 | \SI{150}{\milli\second} and \SI{155}{\milli\second}. 74 | The steps that appear on the graphs show when split votes occur (the 75 | cluster must wait for another election timeout before a leader can be 76 | elected). 77 | The measurements were taken on a cluster of five servers with a 78 | broadcast time (network round trip plus disk write) of roughly 79 | \SI{15}{\milli\second}. 80 | Results for a cluster of nine servers are similar. 81 | } 82 | \label{fig:leaderelection:benchmark} 83 | \end{figure} 84 | 85 | The benchmark tried to generate a worst-case scenario for leader 86 | election. First, it synchronized the old leader's heartbeat RPCs before 87 | causing the old leader to exit; this made the follower's election timers 88 | start at approximately the same time, leading to many split votes if the 89 | timeout values were not sufficiently randomized. Second, the servers in 90 | each trial had different log lengths, so two of the four servers were not 91 | eligible to become leader (however, Section~\ref{leaderelection:logsdiff} 92 | will show that this has only a minor effect on election times). 93 | 94 | Figure~\ref{fig:leaderelection:benchmark-randomness} shows that elections 95 | complete in under one second when the timeout range is sufficiently 96 | broad. A small amount of randomization in the election timeout is enough 97 | to avoid split votes in elections. In the absence of randomness, leader 98 | election consistently took longer than \SI{10}{seconds} due to many split 99 | votes. Adding just \SI{5}{\milli\second} of randomness helps 100 | significantly, resulting in 101 | a median downtime of \SI{287}{\milli\second}. 102 | Using more randomness improves worst-case 103 | behavior: with a \SI{50}{\milli\second} random range, the worst-case 104 | completion time 105 | (over \num{1000} trials) was \SI{513}{\milli\second}. 106 | 107 | Figure~\ref{fig:leaderelection:benchmark-scale} shows that downtime can 108 | be reduced by reducing the election timeout. 109 | With an election timeout of 110 | \SIrange{12}{24}{\milli\second}, it takes only 111 | \SI{35}{\milli\second} on average to elect a leader 112 | (the longest trial took \SI{152}{\milli\second}). 113 | However, lowering the timeouts beyond this point violates Raft's timing 114 | requirement: leaders have difficulty broadcasting heartbeats before 115 | other servers start new elections. This can cause unnecessary leader 116 | changes and lower overall system availability. 117 | We recommend using a conservative election timeout such as 118 | \SIrange{150}{300}{\milli\second}; 119 | such timeouts are unlikely to cause unnecessary leader changes, result 120 | in a low rate of split votes, and will still provide good availability. 121 | 122 | \subsubsection{Simulated WAN network} 123 | 124 | We developed a simulator called AvailSim~\cite{availsim} to 125 | explore a wider range of leader election scenarios. Unlike the 126 | fixed network in our real-world test cluster, AvailSim allows the 127 | latency of the simulated network to be configured arbitrarily. 
128 | (We used AvailSim to interactively explore a wide space of leader 129 | election scenarios and algorithms, but this chapter only includes a few 130 | relevant results.) 131 | 132 | AvailSim is a close approximation to a complete Raft system, but its 133 | election time results differ from real elections in two ways: 134 | % 135 | \begin{enumerate} 136 | % 137 | \item Each server in AvailSim begins with a fresh election timer. In 138 | practice, the leader will crash at some random point in time between 139 | heartbeats. The election times produced by AvailSim are thus an average 140 | of half a heartbeat interval too large. 141 | % 142 | \item AvailSim does not add any time for processing messages or writing 143 | to disk (these are infinitely fast in the simulator). CPU time should be 144 | short relative to network latency, and disks need not play a significant 145 | role in leader election anyhow (see 146 | Section~\ref{leaderelection:split:total}). 147 | % 148 | \end{enumerate} 149 | 150 | \begin{figure}[p] 151 | \centering 152 | \includegraphics{leaderelection/multi-submission-failures} 153 | \vspace{-4ex} 154 | \vcaption[election performance on a simulated WAN cluster]{ 155 | Election performance as calculated by AvailSim for a WAN (one-way 156 | network latency of 157 | \SIrange{30}{40}{\milli\second}). The figure shows a cluster of 158 | five servers with zero, one, and two servers having failed. 159 | \\ 160 | The left graph plots the CDFs of election times. The right graph plots 161 | the same curves on a reverse-logarithmic $y$ axis to magnify 162 | detail on the tail of the distribution. Each CDF summarizes \num{10000} 163 | simulated elections. The point on each curve marks the average election 164 | time. 165 | } 166 | \label{fig:leaderelection:simulation:dist:submission-failures} 167 | \end{figure} 168 | 169 | \begin{figure}[p] 170 | \centering 171 | \includegraphics{leaderelection/multi-submission-failures-logsdiff} 172 | \vspace{-4ex} 173 | \vcaption[election performance with differing logs]{ 174 | Election performance as calculated by AvailSim when each server has a 175 | different log (using the same WAN configuration as 176 | Figure~\ref{fig:leaderelection:simulation:dist:submission-failures}). 177 | Performance is similar to 178 | Figure~\ref{fig:leaderelection:simulation:dist:submission-failures}, where 179 | the servers' logs are all the same. 180 | } 181 | \label{fig:leaderelection:simulation:dist:submission-failures-logsdiff} 182 | \end{figure} 183 | 184 | We used AvailSim to approximate a WAN spanning the continental US. Each 185 | message was assigned a latency chosen randomly from the uniform range of 186 | \SIrange{30}{40}{\milli\second}, and the servers' election timeout range was set 187 | accordingly to \SIrange{300}{600}{\milli\second} (about 10--20 times the 188 | one-way network latency). 189 | 190 | Figure~\ref{fig:leaderelection:simulation:dist:submission-failures} shows 191 | how quickly a five-server cluster elects a leader in this WAN 192 | environment. When only one of the five servers has failed, the average 193 | election completes within about 194 | \SI{475}{\milli\second}, and 99.9\% of 195 | elections complete within 196 | \SI{1.5}{\second}. Even when two of the five servers 197 | have failed, the average election takes about 198 | \SI{650}{\milli\second} (about 20 199 | times the one-way network latency), and 99.9\% 200 | of elections complete in 201 | \SI{3}{\second}. 
We believe these election times are more than adequate for 202 | most WAN deployments. 203 | -------------------------------------------------------------------------------- /leaderelection/splittotal.tex: -------------------------------------------------------------------------------- 1 | \section{How fast will Raft elect a leader when split votes are possible?} 2 | \label{leaderelection:split:total} 3 | 4 | Given a split vote rate, we can estimate the total election time. 5 | Raft will elect a leader as soon as an election term 6 | successfully completes without a split vote. When a split vote occurs, 7 | it's likely that all servers have reset their timers, since servers do 8 | this when they grant a vote (this isn't quite true when logs 9 | differ; see Section~\ref{leaderelection:logsdiff}). Thus, the 10 | next election term has the same probability of success as an entirely 11 | new election and will take just as long. In other words, each election 12 | term is essentially memoryless, and the number of election terms 13 | required in an election can be modeled as a geometric distribution, 14 | where the probability of success is the probability that a split vote 15 | does not occur. Therefore, Raft elections are expected to complete in 16 | $\dfrac{1}{1-\text{split vote rate}}$ election terms on average. 17 | 18 | If a split vote occurs in a particular election term, the election term 19 | takes about $1+M_s$ time units plus a one-way network latency to reset 20 | the servers' election timers. We do not include the time for the 21 | candidate to record its own vote on disk, since this time can be 22 | overlapped with the RequestVote messages (with this optimization, the 23 | candidate may not count its own vote towards leadership until the vote 24 | is durably recorded). After the vote is split, the cluster must wait 25 | another election timeout before the next election term begins. This 26 | repeats for each split vote; finally, the time for an election with no split 27 | votes (from Section~\ref{leaderelection:nosplit}) is added. Thus, the 28 | total time for an election, $E_s$, is: 29 | \begin{align*} 30 | E_s &= \Big(\sum_\text{split votes} \text{time for split vote}\Big) 31 | + 32 | \Big(\text{time for election with no split vote}\Big) \\ 33 | % 34 | E_s &= \Big(\sum_\text{split votes} (1 + M_s + L)\Big) 35 | + 36 | \Big(1 + M_s + 2L + W - U(0,\dfrac{1}{2})\Big) & \\ 37 | % 38 | \Ex[E_s] &= \Big((\frac{1}{1-\text{split vote rate}} - 1) \times 39 | (1 + \frac{1}{s+1} + \Ex[L])\Big) 40 | + 41 | \Big(1 + \frac{1}{s+1} + 2\Ex[L] + \Ex[W] - \dfrac{1}{4}\Big) \\ 42 | % 43 | \Ex[E_s] &= \frac{1}{1-\text{split vote rate}} \times 44 | \Big(1 + \frac{1}{s+1} + \Ex[L]\Big) 45 | + \Ex[L] + \Ex[W] - \dfrac{1}{4} 46 | \end{align*} 47 | where $L$ is the one-way network latency and $W$ is the latency for a 48 | durable disk write. 49 | 50 | Howard~\cite{Howard:2014} suggests an optimization to decrease the time 51 | for an election after split votes occur. The optimization separates 52 | followers' timeouts from candidates' timeouts, where candidates select 53 | smaller timeouts from a distribution with a smaller range. This results 54 | in faster iterations once split votes have occurred, though it risks 55 | additional split votes. The remainder of this chapter does not use this 56 | optimization.
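Returning to the formula for $\Ex[E_s]$, here is a purely illustrative
calculation; the parameter values below are assumptions chosen only to show
how the formula is applied, not measurements. Suppose the split vote rate is
20\%, and take $\frac{1}{s+1} = \frac{1}{5}$, $\Ex[L] = 0.1$, and
$\Ex[W] = 0$, all in the same normalized time units used above. Then:
\begin{align*}
\Ex[E_s] &= \frac{1}{1-0.2} \times
            \Big(1 + \frac{1}{5} + 0.1\Big) + 0.1 + 0 - \dfrac{1}{4} \\
         &= 1.25 \times 1.3 + 0.1 - 0.25 \approx 1.48
\end{align*}
so an election under these assumed values is expected to take roughly one and
a half time units.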
57 | 58 | \begin{figure} 59 | \centering 60 | \includegraphics[height=5.5in]{leaderelection/overall} 61 | \hspace{-2em} 62 | \vcaption[expected overall election time]{ 63 | The expected total election times for various clusters, 64 | as defined by $\Ex[E_s]$, with a fixed one-way network latency. 65 | It excludes the time to write to stable 66 | storage (which is usually negligible). The timeout range and 67 | expected overall election time are presented as multiples of the one-way 68 | network latency ($l$), since $l$ is typically fixed in a given 69 | deployment. 70 | } 71 | \label{fig:leaderelection:theory:overall} 72 | \end{figure} 73 | 74 | Figure~\ref{fig:leaderelection:theory:overall} plots the expected time to 75 | elect a leader when the network latency is fixed, by combining the 76 | formula for $\Ex[E_s]$ with the formula for $Pr(D_{c,s} \leq l)$. 77 | From the graphs, a Raft cluster with a sufficiently broad timeout range 78 | will usually elect a leader within 20 times the one-way network latency, 79 | even when running with a bare majority of available servers. This 80 | suggests that most datacenter Raft deployments should be able to achieve 81 | typical leader election times under \SI{100}{\milli\second}. Even 82 | worst-case global deployments, with one-way latencies of 83 | \SI{200}{\milli\second}, 84 | should typically be able to elect leaders within \SI{4}{seconds}. (Election 85 | times may be larger if some servers are deployed on other planets.) 86 | 87 | Each of the curves has a knee. If the timeout range is chosen to be too 88 | short, too many servers time out before others are able to collect 89 | votes, resulting in poor election times. Once timeout ranges are 90 | sufficiently large (about 3--8 times the network latency, depending on 91 | the cluster), the curves become linear with a slight upward slope: 92 | elections complete after few or no split votes, but they must wait 93 | longer for each timeout to elapse. 94 | 95 | The graphs provide insight into how to configure election timeouts: a 96 | conservative setting is probably best in practice. The minimum point on 97 | the graphs represents the best average election time possible for each 98 | given cluster configuration. However, attaining this minimum time is 99 | quite risky, since the minimum is close to the knee in the curve. If the 100 | network latency turns out to be slightly higher than anticipated in 101 | practice, that might push the system into the left region of the graph 102 | where election times skyrocket. It is better to configure systems 103 | farther to the right, trading off a slightly higher average election 104 | time in exchange for a more robust system. Thus, we recommend using a 105 | timeout range that is ten times the one-way network latency (even if the 106 | true network latency is five times greater than anticipated, most clusters would 107 | still be able to elect a leader in a timely manner).
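This guidance can be captured in a few lines of code. The sketch below is
only an illustration and is not taken from LogCabin; the function name and
the particular 10--20$\times$ window are assumptions based on one reading of
the conservative \SIrange{150}{300}{\milli\second} recommendation and the
10--20$\times$ rule of thumb discussed in this chapter.
\begin{verbatim}
// Illustrative sketch: pick a randomized election timeout from a window
// of roughly 10-20 times the estimated one-way network latency.
#include <chrono>
#include <cstdint>
#include <random>

std::chrono::milliseconds
chooseElectionTimeout(std::chrono::milliseconds oneWayLatency)
{
    // Thread-local engine so concurrent timers do not share state.
    thread_local std::mt19937_64 rng{std::random_device{}()};
    std::uniform_int_distribution<int64_t> window(
        10 * oneWayLatency.count(), 20 * oneWayLatency.count());
    return std::chrono::milliseconds(window(rng));
}
\end{verbatim}
For example, with a \SI{15}{\milli\second} latency estimate this yields
timeouts in roughly the \SIrange{150}{300}{\milli\second} range.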
108 | -------------------------------------------------------------------------------- /main.tex: -------------------------------------------------------------------------------- 1 | % For book printing, change to: 2 | % \documentclass[11pt,twoside]{report} 3 | \documentclass[11pt]{report} 4 | 5 | \usepackage{suthesis-2e-mod} 6 | 7 | %% draw grid on page, useful for debugging margins 8 | %\usepackage{pagegrid} 9 | %\pagegridsetup{top-left,step=.5in} 10 | 11 | \usepackage{epsfig} 12 | \usepackage[hyphens]{url} 13 | \usepackage{paralist} % provides \compactitem 14 | \usepackage{times} % use times font 15 | \usepackage{comment} % provide \begin{comment}...\end{comment} 16 | \usepackage{xcolor} % commands for changing colors 17 | \usepackage{lastpage} % for total number of pages 18 | \usepackage[normalem]{ulem} % for \sout{to strike out text} 19 | \usepackage{stfloats} % something to do with placement of 20 | % two-column tables 21 | \usepackage{etoolbox} % for conditional toggles 22 | \usepackage[pdfauthor={Diego Ongaro}, 23 | pdftitle={Consensus: Bridging Theory and Practice}, 24 | pdfsubject={Stanford University Ph.D. Dissertation}, 25 | pagebackref,colorlinks=true, 26 | linkcolor=blue!50!black!90, 27 | citecolor=blue!50!black!90, 28 | urlcolor=blue!50!black!90, 29 | bookmarks]{hyperref} % for clickable links in PDF 30 | \usepackage{multicol} 31 | \usepackage{tabularx} % for easier tables with wrapped text 32 | \usepackage{tabulary} % for easier tables with wrapped text 33 | \usepackage{relsize} % for \mathlarger 34 | \usepackage{amssymb} 35 | \usepackage{amsmath} % Needed to enable unnumbered equations (\begin{equation*}) 36 | 37 | \usepackage{placeins} % provides \FloatBarrier 38 | \usepackage{siunitx} % typesetting units after numbers 39 | \sisetup{range-phrase=--, 40 | range-units=single, 41 | binary-units, 42 | input-decimal-markers=., 43 | group-separator={,}, 44 | group-minimum-digits=4} 45 | \usepackage{mathptmx} % use times in math mode 46 | \frenchspacing 47 | 48 | \usepackage{caption} 49 | 50 | % make links black for book printing 51 | \ifbool{@twoside}{\hypersetup{hidelinks}}{} 52 | 53 | % for subfigure environment, where refs look like Figure 9.8(c) 54 | \usepackage[labelformat=simple]{subcaption} 55 | \renewcommand\thesubfigure{(\alph{subfigure})} 56 | 57 | \newcommand\mazieres{Mazi\`{e}res} 58 | 59 | \title{Consensus: Bridging Theory and Practice} 60 | \author{Diego Ongaro} 61 | \principaladviser{John Ousterhout} 62 | \firstreader{Mendel Rosenblum} 63 | \secondreader{David \mazieres} 64 | \submitdate{August 2014} 65 | 66 | \def\thesiscopyrightpage{% 67 | \urlstyle{same} 68 | \null\vfill 69 | \begin{center} 70 | \large \copyright\ 2014 Diego Ongaro 71 | \vspace{.75in}\\ 72 | \parbox{1.5in}{ 73 | \includegraphics[scale=1]{cc-by} 74 | } 75 | \parbox{3.5in}{ 76 | This work is licensed under the Creative Commons \\ 77 | Attribution 4.0 International License.\\ 78 | \url{http://creativecommons.org/licenses/by/4.0/} 79 | } 80 | 81 | \vspace{.75in} 82 | \parbox{5.5in}{ 83 | This dissertation expands on a paper written by Diego Ongaro and John 84 | Ousterhout entitled \emph{In Search of an Understandable Consensus 85 | Algorithm}~\cite{raftatc}. Most of the paper's content is included in 86 | some form in this dissertation. It is reproduced in this dissertation 87 | and licensed under the Creative Commons Attribution license with 88 | permission from John Ousterhout. 
89 | } 90 | 91 | \vspace{1in} 92 | This dissertation is distributed by Stanford University online: 93 | \url{http://purl.stanford.edu/qr033xr6097}\\ 94 | The \LaTeX{} source files used to create this document are 95 | available online: 96 | \url{https://github.com/ongardie/dissertation/} 97 | 98 | \end{center} 99 | \vfill\newpage\urlstyle{tt}} 100 | 101 | \long\def\signature#1{% 102 | \begin{flushright} 103 | \begin{minipage}{5in} 104 | \parindent=0pt 105 | I certify that I have read this dissertation and that, in my opinion, 106 | it is fully adequate in scope and quality as a dissertation for the degree 107 | of Doctor of Philosophy. 108 | \par 109 | \vspace{3ex} 110 | \hbox to 5in{\hfil\begin{tabular}{@{}l@{}}\textbf{#1}\end{tabular}} 111 | \end{minipage} 112 | \end{flushright}} 113 | 114 | \long\def\ucgssignature#1{% 115 | \begin{flushright} 116 | \begin{minipage}{5in} 117 | \parindent=0pt 118 | Approved for the Stanford University Committee on Graduate Studies. 119 | \par 120 | \vspace{3ex} 121 | \hbox to 5in{\hfil\begin{tabular}{@{}l@{}}\textbf{#1}\end{tabular}} 122 | \end{minipage} 123 | \end{flushright}} 124 | 125 | \def\signaturepage{% 126 | \signature{John Ousterhout, Principal Adviser} 127 | \vspace{.5in} 128 | \signature{Mendel Rosenblum} 129 | \vspace{.5in} 130 | \signature{David \mazieres} 131 | \vspace{.5in} 132 | \ucgssignature{Patricia J. Gumport, Vice Provost for Graduate Education} 133 | \vfill 134 | \begin{center} 135 | \emph{This signature page was generated electronically. An original signed 136 | hard\\ copy of the signature page is on file in Stanford University 137 | Archives.} 138 | \end{center} 139 | } 140 | 141 | 142 | \newcommand\name{Raft} 143 | 144 | \newcommand\red[1]{\textcolor{red}{#1}} % For newly proposed text 145 | \newcommand\redstrike[1]{\red{\sout{#1}}} % To strike out text 146 | \newcommand\blue[1]{\textcolor{blue}{XXX- #1}} % For comments 147 | 148 | \newcommand{\Ex}{\mathop{\bf E\/}} % expected value 149 | 150 | 151 | \hyphenation{LogCabin} % force no hyphens in 'LogCabin' 152 | \hyphenation{RAMCloud} % force no hyphens in 'RAMCloud' 153 | \hyphenation{LevelDB} % force no hyphens in 'LevelDB' 154 | \hyphenation{Append-Entries} % break 'AppendEntries' between words 155 | \hyphenation{Request-Vote} % break 'RequestVote' between words 156 | 157 | % Adjust caption display: bold figure name, slight margins 158 | \usepackage{caption} 159 | \captionsetup{labelfont=bf, margin=10pt} 160 | 161 | % Spacing between footnotes and text. 162 | \setlength{\skip\footins}{6ex} 163 | 164 | % Footnotes with no number. 165 | \makeatletter 166 | \def\blfootnoteindented{\xdef\@thefnmark{}\@footnotetext} 167 | \newcommand\blfootnote[1]{\blfootnoteindented{\hspace{-2.1em} #1}} 168 | \makeatother 169 | 170 | % read environment variables 171 | \usepackage{catchfile} 172 | \newcommand{\getenv}[2][]{% 173 | \CatchFileEdef{\temp}{"|kpsewhich --var-value #2"}{}% 174 | \if\relax\detokenize{#1}\relax\temp\else\let#1\temp\fi} 175 | 176 | % Trim margins if TRIM environment variable is set to "yes". 
177 | \getenv[\TRIM]{TRIM} 178 | \edef\trimdef{{\TRIM}} 179 | \expandafter\ifstrequal\trimdef{yes }{ 180 | \usepackage[paperwidth=6.6in, paperheight=9.35in, top=.75in, bottom=.5in, left=.3in, right=.3in]{geometry} 181 | }{ 182 | } 183 | 184 | % for proof 185 | \usepackage{color} 186 | \definecolor{boxshade}{gray}{0.85} 187 | \usepackage{proof/tlatex} 188 | \usepackage{amsthm} 189 | \theoremstyle{definition} 190 | \input{proof/defs.tex} 191 | 192 | 193 | % drop capitalization on List of Tables, List of Figures 194 | \renewcommand{\listfigurename}{List of figures} 195 | \renewcommand{\listtablename}{List of tables} 196 | 197 | 198 | \begin{document} 199 | 200 | \beforepreface 201 | 202 | \include{abstract/abstract} 203 | 204 | \prefacesection{Preface} 205 | 206 | Readers may want to refer to the Raft website~\cite{implementations} for 207 | videos about Raft and an interactive visualization of Raft. 208 | 209 | \include{ack/ack} 210 | \afterpreface 211 | 212 | % Do this \afterpreface to avoid breaking tex. 213 | % Stash chapter names in \Chaptername 214 | \let\Chaptermark\chaptermark 215 | \renewcommand\chaptermark[1]{ 216 | \def\Chaptername{#1}\Chaptermark{#1} 217 | } 218 | 219 | \newcommand{\vcaption}[2][Figure]{ 220 | \caption[\Chaptername: #1]{#2} 221 | } 222 | 223 | % These terrible hacks make the chapter/appendix/bibliography name 224 | % appear on odd-sided pages rather than the section name. I prefer this 225 | % since some of my section names are too big to fit. 226 | \makeatletter 227 | \def\sectionmark#1{% 228 | } 229 | \def\Chaptermark#1{% 230 | \markboth {\MakeUppercase{% 231 | \ifnum \c@secnumdepth >\m@ne 232 | \@chapapp\ \thechapter. \ % 233 | \fi 234 | #1}}{\MakeUppercase{% 235 | \ifnum \c@secnumdepth >\m@ne 236 | \@chapapp\ \thechapter. 
\ % 237 | \fi 238 | #1}}}% 239 | \let\@mkboth\markboth 240 | \makeatother 241 | 242 | 243 | \newcommand\cold{$C_\text{old}$} 244 | \newcommand\cnew{$C_\text{new}$} 245 | \newcommand\cboth{$C_\text{old,new}$} 246 | 247 | 248 | \include{intro/intro} 249 | \include{motivation/motivation} 250 | \include{basicraft/raft} 251 | \include{membership/membership} 252 | \include{compaction/compaction} 253 | \include{clients/clients} 254 | \include{userstudy/userstudy} 255 | \include{correctness/correctness} 256 | \include{leaderelection/leaderelection} 257 | \include{performance/performance} 258 | \include{related/related} 259 | \include{conclusion/conclusion} 260 | 261 | \appendix 262 | 263 | \include{userstudymaterials/userstudyquizzes} 264 | \include{proof/proof} 265 | 266 | \label{end} 267 | 268 | \bibliographystyle{acmcaps} 269 | \bibliography{local} 270 | 271 | \end{document} 272 | -------------------------------------------------------------------------------- /membership/Makefrag: -------------------------------------------------------------------------------- 1 | FIGGENPDF := $(FIGGENPDF) \ 2 | membership/special4to5.pdf \ 3 | membership/special5to4.pdf \ 4 | membership/special3to4.pdf \ 5 | membership/special4to3.pdf \ 6 | membership/catchupstart.pdf \ 7 | membership/catchupend.pdf \ 8 | membership/catchupone.pdf \ 9 | membership/catchupmany.pdf 10 | 11 | membership/special4to5.pdf: membership/special.svg 12 | inkscape -T -z -i 4to5 -A $@ $< 13 | membership/special5to4.pdf: membership/special.svg 14 | inkscape -T -z -i 5to4 -A $@ $< 15 | membership/special3to4.pdf: membership/special.svg 16 | inkscape -T -z -i 3to4 -A $@ $< 17 | membership/special4to3.pdf: membership/special.svg 18 | inkscape -T -z -i 4to3 -A $@ $< 19 | 20 | membership/catchupstart.pdf: membership/catchup2.svg 21 | inkscape -T -z -i start -A $@ $< 22 | membership/catchupend.pdf: membership/catchup2.svg 23 | inkscape -T -z -i end -A $@ $< 24 | 25 | membership/catchupone.pdf: membership/catchup3.svg 26 | inkscape -T -z -i one -A $@ $< 27 | membership/catchupmany.pdf: membership/catchup3.svg 28 | inkscape -T -z -i many -A $@ $< 29 | -------------------------------------------------------------------------------- /membership/arbitrary.tex: -------------------------------------------------------------------------------- 1 | \section{Arbitrary configuration changes using joint consensus} 2 | \label{membership:arbitrary} 3 | 4 | This section presents a more complex approach to cluster membership 5 | changes that handles arbitrary changes to the configuration at one time. 6 | For example, two servers can be added to a cluster at once, or all of 7 | the servers in a five-server cluster can be replaced at once. This was 8 | the first approach to membership changes that we came up with, and it is 9 | described only for completeness. Now that we know about the simpler 10 | single-server approach, we recommend that one instead, since handling 11 | arbitrary changes requires extra complexity. Arbitrary changes are 12 | typically the way membership changes are assumed to operate in the 13 | literature, but we don't think this flexibility is needed in real 14 | systems, where a series of single-server changes can change the cluster 15 | membership to any desired configuration. 
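To make the last point concrete, the sketch below shows one way a client-side
script could drive an arbitrary configuration change as a series of
single-server changes using the AddServer and RemoveServer RPCs summarized in
Figure~\ref{fig:membership:cheatsheet2}. This is only an illustration, not
code from LogCabin; the \texttt{Cluster} type and its method names are
hypothetical stand-ins for issuing those RPCs to the leader.
\begin{verbatim}
#include <iostream>
#include <set>
#include <string>

struct Cluster {
    // Stand-ins for issuing the AddServer / RemoveServer RPCs; a real
    // client would send these to the cluster leader and wait for each
    // change to complete before starting the next one.
    void addServer(const std::string& address) {
        std::cout << "AddServer " << address << "\n";
    }
    void removeServer(const std::string& address) {
        std::cout << "RemoveServer " << address << "\n";
    }
};

// Adds servers before removing servers, which (as discussed later in
// this chapter) preserves the cluster's ability to mask a failure
// throughout the transition.
void changeConfiguration(Cluster& cluster,
                         const std::set<std::string>& oldServers,
                         const std::set<std::string>& newServers)
{
    for (const std::string& server : newServers)
        if (oldServers.count(server) == 0)
            cluster.addServer(server);
    for (const std::string& server : oldServers)
        if (newServers.count(server) == 0)
            cluster.removeServer(server);
}
\end{verbatim}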
16 | 17 | 18 | To ensure safety across arbitrary configuration changes, 19 | the cluster first switches to a transitional 20 | configuration we call \emph{joint consensus}; once the joint consensus 21 | has been committed, the system then transitions to the new 22 | configuration. The joint consensus 23 | combines both the old and new configurations: 24 | \begin{itemize} 25 | \item Log entries are replicated to all servers in both configurations. 26 | \item Any server from either configuration may serve as leader. 27 | \item Agreement (for elections and entry commitment) requires 28 | separate majorities from \emph{both} the old and new configurations. 29 | For example, when changing from a cluster of 3 servers to a 30 | different cluster of 9 servers, agreement requires both 2 of 31 | the 3 servers in the old configuration and 5 of the 9 servers in the new 32 | configuration. 33 | \end{itemize} 34 | The joint consensus allows individual 35 | servers to transition between configurations at different times 36 | without compromising safety. Furthermore, joint 37 | consensus allows the cluster to continue servicing client requests 38 | throughout the configuration change. 39 | 40 | \begin{figure} 41 | \centering 42 | \includegraphics[scale=.50]{membership/reconfigurationconf} 43 | \vcaption[joint consensus timeline]{ 44 | Timeline for a configuration change using joint consensus. Dashed lines show configuration 45 | entries that have been created but not committed, and solid lines 46 | show the latest committed configuration entry. The leader first creates 47 | the \cboth{} configuration entry in its log and commits it to \cboth{} 48 | (a majority of \cold{} and a majority of \cnew{}). 49 | Then it creates the \cnew{} entry and commits it to a majority of 50 | \cnew{}. There is no point in time in which \cold{} and \cnew{} can 51 | both make decisions independently. 52 | } 53 | \label{fig:membership:reconfiguration} 54 | \end{figure} 55 | 56 | This approach extends the single-server membership change algorithm with an 57 | intermediate log entry for the joint configuration; 58 | Figure~\ref{fig:membership:reconfiguration} illustrates the 59 | process. When the leader receives a request to 60 | change the configuration from \cold{} to \cnew{}, it stores the 61 | configuration for joint consensus (\cboth{} in the figure) as a log 62 | entry and replicates that entry using the normal Raft mechanism. As with 63 | the single-server configuration change algorithm, each server starts 64 | using a new configuration as soon as it stores the configuration in its 65 | log. This means that 66 | the leader will use the rules of \cboth{} to determine when the log 67 | entry for \cboth{} is committed. If the leader crashes, a new 68 | leader may be chosen under either \cold{} or \cboth{}, depending 69 | on whether the winning candidate has received \cboth{}. In any 70 | case, \cnew{} cannot make unilateral decisions during this period. 71 | 72 | Once \cboth{} has been committed, neither \cold{} nor \cnew{} 73 | can make decisions without approval of the other, and 74 | the Leader Completeness Property ensures that only servers with the 75 | \cboth{} log entry can be elected as leader. 76 | It is now safe for the 77 | leader to create a log entry describing \cnew{} and replicate it 78 | to the cluster. Again, this configuration will take effect on 79 | each server as soon as it is seen. 
When the \cnew{} log entry 80 | has been committed under the rules of \cnew{}, the old configuration 81 | is irrelevant and servers not in the 82 | new configuration can be shut down. As shown in 83 | Figure~\ref{fig:membership:reconfiguration}, 84 | there is no time when \cold{} and \cnew{} can both make 85 | unilateral decisions; this guarantees safety. 86 | 87 | 88 | The joint consensus approach could be generalized to allow a 89 | configuration change to begin while a prior change was still in 90 | progress. However, there would not be much practical advantage to doing 91 | this. Instead, a leader rejects additional configuration changes when a 92 | configuration change is already in progress (when its latest 93 | configuration is not yet committed or is a joint configuration rather than a simple one). Changes 94 | that are rejected in this way can simply wait and try again later. 95 | 96 | This joint consensus approach is more complex than the single-server 97 | changes precisely because it requires transitioning to and from an 98 | intermediate configuration. Joint configurations also require changes to 99 | how all voting and commitment decisions are made; instead of simply 100 | counting servers, the leader must check if the servers form a majority 101 | of the old cluster and also form a majority of the new cluster. 102 | Implementing this required finding and changing about six comparisons in 103 | our Raft implementation~\cite{logcabin}. 104 | -------------------------------------------------------------------------------- /membership/cheatsheet2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/membership/cheatsheet2.pdf -------------------------------------------------------------------------------- /membership/membership.tex: -------------------------------------------------------------------------------- 1 | \chapter{Cluster membership changes} 2 | \label{membership} 3 | 4 | Up until now we have assumed that the cluster \emph{configuration} 5 | (the set of servers participating in the consensus algorithm) is fixed. 6 | In practice, it will occasionally be necessary to change the 7 | configuration, for example to replace servers when they fail or to 8 | change the degree of replication. This could be done 9 | manually, using one of two approaches: 10 | \begin{itemize} 11 | \item Configuration changes could be done by taking the entire cluster 12 | off-line, updating configuration files, and then restarting the cluster. 13 | However, this would leave the cluster unavailable during the changeover. 14 | \item Alternatively, a new server could replace a cluster member by 15 | acquiring its network address. However, the administrator must guarantee 16 | that the replaced server will never come back up, or else the system 17 | would lose its safety properties (for example, there would be an extra 18 | vote). 19 | \end{itemize} 20 | Both of these approaches to membership changes have significant downsides, 21 | and if there are any manual steps, they risk operator error. 22 | 23 | \begin{figure} 24 | \centering 25 | \includegraphics[scale=0.95]{membership/cheatsheet2} 26 | \vcaption[RPCs to change cluster membership]{ 27 | RPCs used to change cluster membership. The 28 | AddServer RPC is used to add a new server to the current configuration, 29 | and the RemoveServer RPC is used to remove a server from the current 30 | configuration.
31 | Section numbers such as \S\ref{membership:safety} indicate where 32 | particular features are discussed. 33 | Section~\ref{membership:system} discusses ways to use 34 | these RPCs in a complete system. 35 | } 36 | \label{fig:membership:cheatsheet2} 37 | \end{figure} 38 | 39 | 40 | In order to avoid these issues, we decided to automate configuration 41 | changes and incorporate them into the \name{} consensus algorithm. Raft 42 | allows the cluster to continue operating normally during changes, and 43 | membership changes can be implemented with only a few extensions to the 44 | basic consensus algorithm. Figure~\ref{fig:membership:cheatsheet2} 45 | summarizes the RPCs used to change cluster membership, whose elements 46 | are described in the remainder of this chapter. 47 | 48 | 49 | \input{membership/safety} 50 | \input{membership/availability} 51 | \input{membership/arbitrary} 52 | 53 | \section{System integration} 54 | \label{membership:system} 55 | 56 | Raft implementations may expose the cluster membership change mechanism 57 | described in this chapter in different ways. For example, the AddServer 58 | and RemoveServer RPCs in Figure~\ref{fig:membership:cheatsheet2} can be 59 | invoked by administrators directly, or they can be invoked by a script 60 | that uses a series of single-server steps to change the configuration in 61 | arbitrary ways. 62 | 63 | It may be desirable to invoke membership changes automatically in 64 | response to events like server failures. However, this should only be 65 | done according to a reasonable policy. For example, it can be dangerous 66 | for the cluster to automatically remove failed servers, as it could then 67 | be left with too few replicas to satisfy the intended durability and 68 | fault-tolerance requirements. One reasonable approach is to have the 69 | system administrator configure a desired cluster size, and within that 70 | constraint, available servers could automatically replace failed 71 | servers. 72 | 73 | When making cluster membership changes that require multiple 74 | single-server steps, it is preferable to add servers before removing 75 | servers. For example, to replace a server in a three-server cluster, 76 | adding the new server first and then removing the old one allows the system to 77 | handle one server failure at all times throughout the process. However, 78 | if the old server was removed before the new one was added, the system 79 | would temporarily be unable to mask any failures (since two-server 80 | clusters require both servers to be available). 81 | 82 | Membership changes motivate a different approach to bootstrapping a 83 | cluster. Without dynamic membership, each server simply has a static 84 | file listing the configuration. With dynamic membership changes, the 85 | static configuration file is no longer needed, since the system manages 86 | configurations in the Raft log; it is also potentially error-prone 87 | (e.g., with which configuration should a new server be initialized?). 88 | Instead, we recommend that the very first time a cluster is created, one 89 | server is initialized with a configuration entry as the first entry in 90 | its log. This configuration lists only that one server; it alone forms a 91 | majority of its configuration, so it can consider this configuration 92 | committed. Other servers from then on should be initialized with empty 93 | logs; they are added to the cluster and learn of the current 94 | configuration through the membership change mechanism.
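The bootstrap step described above is small enough to sketch directly. The
following is only an illustration and not LogCabin's actual code; the
\texttt{Entry} and \texttt{Log} types and their fields are hypothetical.
\begin{verbatim}
#include <cstdint>
#include <string>
#include <vector>

struct Entry {
    uint64_t term;
    std::vector<std::string> configuration;  // server addresses
};

struct Log {
    std::vector<Entry> entries;
    uint64_t commitIndex = 0;
};

// Run once, on the very first server, when the cluster is created.
void bootstrap(Log& log, const std::string& ownAddress)
{
    // Entry 1 is a configuration listing only this server.
    log.entries.push_back(Entry{1, {ownAddress}});
    // This server by itself is a majority of that configuration, so the
    // entry can be considered committed immediately. All other servers
    // start with empty logs and join via the membership change mechanism.
    log.commitIndex = 1;
}
\end{verbatim}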
95 | 96 | Membership changes also necessitate a dynamic approach for clients to 97 | find the cluster; this is discussed in Chapter~\ref{clients}. 98 | 99 | \section{Conclusion} 100 | 101 | This chapter described an extension to Raft for handling cluster 102 | membership changes automatically. This is an important part of a 103 | complete consensus-based system, since fault-tolerance requirements 104 | can change over time, and failed servers eventually need to be replaced. 105 | 106 | The consensus algorithm must fundamentally be involved in preserving 107 | safety across configuration changes, since a new configuration affects 108 | the meaning of ``majority''. This chapter presented a simple approach 109 | that adds or removes a single server at a time. These operations preserve 110 | safety simply, since at least one server overlaps any majority during 111 | the change. Multiple single-server changes may be composed to modify the 112 | cluster more drastically. Raft allows the cluster to continue operating 113 | normally during membership changes. 114 | 115 | Preserving availability during configuration changes requires handling 116 | several non-trivial issues. In particular, the issue of a server not in 117 | the new configuration disrupting valid cluster leaders was surprisingly 118 | subtle; we struggled with several insufficient solutions based on log 119 | comparisons before settling on a working solution based on heartbeats. 120 | -------------------------------------------------------------------------------- /membership/safety.tex: -------------------------------------------------------------------------------- 1 | \section{Safety} 2 | \label{membership:safety} 3 | 4 | Preserving safety is the first challenge for configuration changes. 5 | For the mechanism to be safe, 6 | there must be no point during the transition where it is possible for 7 | two leaders to be elected for the same term. If a single configuration 8 | change adds or removes many servers, switching the cluster directly from 9 | the old configuration to the new configuration can be unsafe; 10 | it isn't possible to atomically switch all of the servers at once, so 11 | the cluster can potentially split into two independent majorities 12 | during the transition (see 13 | Figure~\ref{fig:membership:reconfigurationdifficulty}). 14 | 15 | \begin{figure} 16 | \centering 17 | \includegraphics[scale=.50]{membership/reconfigurationdifficulty} 18 | \vcaption[safety challenge]{ 19 | Switching directly from one configuration to another can be 20 | unsafe because different servers will switch at different times. 21 | In this example, the cluster grows from three servers to five. 22 | Unfortunately, there is a point in time where two different leaders 23 | can be elected for the same term, 24 | one with a majority of the old 25 | configuration (\cold{}) and another with a majority of the new 26 | configuration (\cnew{}). 27 | } 28 | \label{fig:membership:reconfigurationdifficulty} 29 | \end{figure} 30 | 31 | \begin{figure} 32 | \centering 33 | 34 | \begin{subfigure}{.45\textwidth} 35 | \centering 36 | \includegraphics[scale=0.50]{membership/special4to5} 37 | \caption{ 38 | Adding one server to a 4-server cluster. 39 | } 40 | \end{subfigure} 41 | ~ 42 | \begin{subfigure}{.45\textwidth} 43 | \centering 44 | \includegraphics[scale=0.50]{membership/special3to4} 45 | \caption{ 46 | Adding one server to a 3-server cluster. 
47 | } 48 | \end{subfigure} 49 | 50 | \vspace{3ex} 51 | 52 | \begin{subfigure}{.45\textwidth} 53 | \centering 54 | \includegraphics[scale=0.50]{membership/special5to4} 55 | \caption{ 56 | Removing one server from a 5-server cluster. 57 | } 58 | \end{subfigure} 59 | ~ 60 | \begin{subfigure}{.45\textwidth} 61 | \centering 62 | \includegraphics[scale=0.50]{membership/special4to3} 63 | \caption{ 64 | Removing one server from a 4-server cluster. 65 | } 66 | \end{subfigure} 67 | 68 | \vcaption[adding/removing one server maintains overlap]{ 69 | The addition and removal of a single server from an even- and an 70 | odd-sized cluster. 71 | In each figure, 72 | the blue rectangle shows a majority of the old cluster, and the red 73 | rectangle shows a majority of the new cluster. 74 | In every single-server membership change, an overlap between any majority 75 | of the old cluster and any majority of the new cluster is preserved, 76 | as needed for safety. For example in (b), a majority of the old cluster 77 | must include two of the left three servers, and a majority of the new 78 | cluster must include three of the servers in the new cluster, of which 79 | at least two must come from the old cluster. 80 | } 81 | \label{fig:membership:special} 82 | \end{figure} 83 | 84 | Most membership change algorithms introduce additional mechanism to deal 85 | with such problems. This is what we did for Raft initially, but we later 86 | discovered a simpler approach, which is to disallow membership changes 87 | that could result in disjoint majorities. Thus, Raft restricts the types 88 | of changes that are allowed: only one server can be added or removed 89 | from the cluster at a time. More complex changes in membership are 90 | implemented as a series of single-server changes. Most of this chapter 91 | describes the single-server approach, which is easier to understand than 92 | our original approach. For completeness, 93 | Section~\ref{membership:arbitrary} describes the original approach, 94 | which incurs additional complexity to handle arbitrary configuration 95 | changes. We implemented the more complex approach in LogCabin prior to 96 | discovering the simpler single-server change approach; it still uses the 97 | more complex approach at the time of this writing. 98 | 99 | When adding a single server to a cluster or removing a single server 100 | from a cluster, any majority of the old cluster overlaps with any 101 | majority of the new cluster; see Figure~\ref{fig:membership:special}. 102 | This overlap prevents the cluster from splitting into two independent 103 | majorities; in terms of the safety argument of 104 | Section~\ref{basicraft:safety:argument}, it guarantees the existence of 105 | ``the voter''. Thus, when adding or removing just a single server, it is 106 | safe to switch directly to the new configuration. Raft exploits this 107 | property to change cluster membership safely using little additional 108 | mechanism. 109 | 110 | Cluster configurations are stored and communicated using special entries 111 | in the replicated log. 112 | This leverages the existing mechanisms in Raft to 113 | replicate and persist configuration information. 114 | It also allows the cluster to continue to service 115 | client requests while configuration changes are in progress, 116 | by imposing ordering between 117 | configuration changes and client requests (while allowing both to be 118 | replicated concurrently in a pipeline and/or in batches). 
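The overlap property underlying this approach can also be checked with a few
lines of arithmetic. The sketch below is only an illustration (the
\texttt{majority} helper is not from any implementation described here); it
verifies numerically that, when a single server is added or removed as in
Figure~\ref{fig:membership:special}, any majority of the old cluster and any
majority of the new cluster must share at least one server.
\begin{verbatim}
#include <cassert>

int majority(int clusterSize) { return clusterSize / 2 + 1; }

int main()
{
    for (int oldSize = 1; oldSize <= 100; ++oldSize) {
        // Adding one server: the union of old and new clusters has
        // oldSize + 1 servers, so two majorities that together exceed
        // that union must intersect.
        assert(majority(oldSize) + majority(oldSize + 1) > oldSize + 1);
        // Removing one server: the union is the old cluster itself.
        if (oldSize > 1)
            assert(majority(oldSize) + majority(oldSize - 1) > oldSize);
    }
    return 0;
}
\end{verbatim}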
119 | 120 | When the leader receives a request to add or remove a server from its 121 | current configuration (\cold{}), it appends the new configuration 122 | (\cnew{}) as an entry in its log and replicates that entry using the 123 | normal Raft mechanism. The new configuration takes effect on each server 124 | as soon as it is added to that server's log: the \cnew{} entry is 125 | replicated to the \cnew{} servers, and a majority of the new 126 | configuration is used to determine the \cnew{} entry's commitment. This 127 | means that servers do not wait for configuration entries to be 128 | committed, and each server always uses the latest configuration found in 129 | its log. 130 | 131 | The configuration change is complete once the \cnew{} entry is 132 | committed. At this point, the leader knows that a majority of the 133 | \cnew{} servers have adopted \cnew{}. It also knows that any servers 134 | that have not moved to \cnew{} can no longer form a majority of the 135 | cluster, and servers without \cnew{} cannot be elected leader. 136 | Commitment of \cnew{} allows three things to continue: 137 | % 138 | \begin{enumerate} 139 | % 140 | \item The leader can acknowledge the successful completion of the 141 | configuration change. 142 | % 143 | \item If the configuration change removed a server, that server can be 144 | shut down. 145 | % 146 | \item Further configuration changes can be started. Before this point, 147 | overlapped configuration changes could degrade to unsafe situations 148 | like the one in Figure~\ref{fig:membership:reconfigurationdifficulty}. 149 | % 150 | \end{enumerate} 151 | 152 | As stated above, servers always use the latest configuration in their 153 | logs, regardless of whether that configuration entry has been committed. 154 | This allows leaders to easily avoid overlapping configuration changes 155 | (the third item above), by not beginning a new change until the previous 156 | change's entry has committed. It is only safe to start another membership 157 | change once a majority of the old cluster has moved to operating under 158 | the rules of \cnew{}. If servers adopted \cnew{} only when they 159 | learned that \cnew{} was committed, Raft leaders would have a difficult 160 | time knowing when a majority of the old cluster had adopted it. They 161 | would need to track which servers know of the entry's commitment, and 162 | the servers would need to persist their commit index to disk; neither of 163 | these mechanisms is required in Raft. Instead, each server 164 | adopts \cnew{} as soon as that entry exists in its log, and the leader 165 | knows it's safe to allow further configuration changes as soon as the 166 | \cnew{} entry has been committed. Unfortunately, this decision does 167 | imply that a log entry for a configuration change can be removed (if 168 | leadership changes); in this case, a server must be prepared to fall 169 | back to the previous configuration in its log. 170 | 171 | In Raft, it is the caller's configuration that is used in reaching 172 | consensus, both for voting and for log replication: 173 | % 174 | \begin{itemize} 175 | % 176 | \item A server accepts AppendEntries requests from a leader that 177 | is not part of the server's latest configuration. Otherwise, a new server 178 | could never be added to the cluster (it would never accept any log entries 179 | preceding the configuration entry that adds the server). 
180 | % 181 | \item A server also grants its vote to a candidate that is not 182 | part of the server's latest configuration (if the candidate has a 183 | sufficiently up-to-date log and a current term). This vote may 184 | occasionally be needed to keep the cluster available. For example, 185 | consider adding a fourth server to a three-server cluster. If one server 186 | were to fail, the new server's vote would be needed to form a majority 187 | and elect a leader. 188 | % 189 | \end{itemize} 190 | % 191 | Thus, servers process incoming RPC requests without consulting their 192 | current configurations. 193 | 194 | -------------------------------------------------------------------------------- /motivation/motivation.tex: -------------------------------------------------------------------------------- 1 | \chapter{Motivation} 2 | \label{motivation} 3 | 4 | Consensus is a fundamental problem in fault-tolerant systems: how can 5 | servers reach agreement on shared state, even in the face of failures? 6 | This problem arises in a wide variety of systems that need to provide 7 | high levels of availability and cannot compromise on consistency; thus, 8 | consensus is used in virtually all consistent large-scale storage 9 | systems. Section~\ref{motivation:problem} describes how consensus is 10 | typically used to create replicated state machines, a general-purpose 11 | building block for fault-tolerant systems; Section~\ref{motivation:uses} 12 | discusses various ways replicated state machines are used in larger 13 | systems; and Section~\ref{motivation:paxos} discusses the problems with 14 | the Paxos consensus protocol, which Raft aims to address. 15 | 16 | \input{motivation/problem} 17 | \input{motivation/uses} 18 | \input{motivation/paxos} 19 | -------------------------------------------------------------------------------- /motivation/paxos.tex: -------------------------------------------------------------------------------- 1 | \section{What's wrong with Paxos?} 2 | \label{motivation:paxos} 3 | 4 | \begin{figure*} 5 | \centering 6 | \includegraphics[scale=0.95]{motivation/paxossummary} 7 | \vcaption[summary of the single-decree Paxos protocol]{ 8 | Summary of the single-decree Paxos consensus protocol. 9 | See \cite{Lamport:2001} for a detailed explanation. 10 | } 11 | \label{fig:motivation:paxos:basic} 12 | \end{figure*} 13 | 14 | 15 | Over the last ten years, Leslie Lamport's Paxos protocol~\cite{Lamport:1998} 16 | has become almost synonymous with consensus: it is the protocol 17 | most commonly taught in courses, and most implementations of consensus 18 | use it as a starting point. Paxos first defines a protocol capable 19 | of reaching agreement on a single decision, such as a single replicated 20 | log entry. We refer to this subset as \emph{single-decree Paxos}. 21 | Paxos then combines multiple instances of this protocol to facilitate 22 | a series of decisions such as a log (\emph{Multi-Paxos}). 23 | Single-decree Paxos is summarized in 24 | Figure~\ref{fig:motivation:paxos:basic}, and Multi-Paxos is summarized 25 | in Figure~\ref{fig:appendix:userstudy:paxossummary4}. 26 | Paxos ensures safety and liveness (it eventually reaches consensus, 27 | assuming an adequate failure detector 28 | is used to avoid proposer livelock), and its correctness has been proven. 29 | Multi-Paxos is efficient in the normal case, and Paxos supports changes 30 | in cluster membership~\cite{Lorch:2006}. 31 | 32 | Unfortunately, Paxos has two significant drawbacks. 
The first drawback is 33 | that Paxos is exceptionally difficult to understand. The full 34 | explanation~\cite{Lamport:1998} is notoriously opaque; few 35 | people succeed in understanding it, and only with great effort. 36 | As a result, there have been several attempts to explain Paxos 37 | in simpler terms~\cite{Lamport:2001, Lampson:1996, Lampson:2001}. 38 | These explanations focus on the single-decree subset, 39 | yet they are still challenging. 40 | In an informal survey of attendees at NSDI 2012, we found few people who 41 | were comfortable with Paxos, even among seasoned researchers. 42 | We struggled with Paxos ourselves; we were not able to understand 43 | the complete protocol 44 | until after reading several explanations 45 | and designing our own alternative protocol, a process that took 46 | almost a year. 47 | 48 | We hypothesize that Paxos' opaqueness stems from its choice of the 49 | single-decree subset as its foundation. Single-decree 50 | Paxos is dense and subtle: it is divided into two stages that do 51 | not have simple intuitive explanations and cannot be understood 52 | independently. Because of this, it is difficult to 53 | develop intuitions about why the single-decree protocol works. 54 | The composition rules for Multi-Paxos add significant additional 55 | complexity and subtlety. We believe that the overall 56 | problem of reaching consensus on multiple decisions (i.e., a log instead 57 | of a single entry) can be decomposed in other ways that are more 58 | direct and obvious. 59 | 60 | The second problem with Paxos is that it does not provide a good 61 | foundation 62 | for building practical implementations. One reason is that 63 | there is no widely agreed-upon algorithm for Multi-Paxos. 64 | Lamport's descriptions are mostly about single-decree Paxos; 65 | he sketched possible approaches to Multi-Paxos, but many 66 | details are missing. There have been several attempts to flesh out and 67 | optimize Paxos, such as \cite{Mazieres:2007}, \cite{Renesse:2011}, 68 | and \cite{Kirsch:2008}, 69 | but these differ from each other and from Lamport's sketches. 70 | Systems such as Chubby~\cite{Chandra:2007} have implemented 71 | Paxos-like algorithms, but in most cases their details have not been 72 | published. 73 | 74 | 75 | Furthermore, the Paxos 76 | architecture is a poor one 77 | for building practical systems; this 78 | is another consequence of the 79 | single-decree decomposition. For example, there is 80 | little benefit to 81 | choosing a collection of log entries independently and then melding 82 | them into a sequential log; this just adds complexity. It is simpler 83 | and more efficient to design a system around a log, where new 84 | entries are appended sequentially in a constrained order. 85 | Another problem is that Paxos 86 | uses a symmetric peer-to-peer approach at its core (though it 87 | also suggests a weak form of leadership as a performance 88 | optimization). This makes 89 | sense in a simplified world where only one decision will be made, 90 | but few practical systems use this approach. If a series of decisions 91 | must be made, it is simpler and faster to first elect a 92 | leader, then have the leader coordinate the decisions. 93 | (Chapter~\ref{related} discusses Egalitarian Paxos, a recent 94 | variant of Paxos that does not use a leader but in some situations can 95 | be more efficient than algorithms that do; however, this algorithm is 96 | much more complex than leader-based algorithms.) 
97 | 98 | As a result, practical systems bear little resemblance to Paxos. 99 | Each implementation begins with Paxos, discovers the difficulties in 100 | implementing it, and then develops a significantly different architecture. 101 | This is time-consuming and error-prone, and the difficulties of 102 | understanding Paxos exacerbate the problem. 103 | Paxos' formulation may be a good one for proving theorems about 104 | its correctness, but real implementations are so 105 | different from Paxos that the proofs have little value. The following 106 | comment from the Chubby implementers is typical: 107 | 108 | {\defaultleftmargin{4em}{}{}{} 109 | \begin{quote} 110 | There are significant gaps between the description of the Paxos 111 | algorithm and the needs of a real-world system\dots. the final system 112 | will be based on an unproven protocol~\cite{Chandra:2007}. 113 | \end{quote} 114 | } 115 | 116 | Because of these problems, we concluded that Paxos does not provide 117 | a good foundation either for system building or for education. 118 | Given the importance of consensus in large-scale software 119 | systems, we decided to see if we could design an alternative consensus 120 | algorithm with better properties than Paxos. \name{} is the result 121 | of that experiment. 122 | 123 | 124 | % 125 | 126 | -------------------------------------------------------------------------------- /motivation/paxossummary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/motivation/paxossummary.pdf -------------------------------------------------------------------------------- /motivation/problem.tex: -------------------------------------------------------------------------------- 1 | \section{Achieving fault tolerance with replicated state machines} 2 | \label{motivation:problem} 3 | 4 | Consensus algorithms typically arise in the context of 5 | \emph{replicated state machines}~\cite{Schneider:1990}. In this approach, state 6 | machines on a collection of servers compute identical copies of 7 | the same state and can continue operating even if some of the 8 | servers are down. 9 | Replicated state machines are used to solve a variety of 10 | fault tolerance problems in 11 | distributed systems, as described in 12 | Section~\ref{motivation:uses}. 13 | Examples of replicated state machines include Chubby~\cite{Burrows:2006} 14 | and ZooKeeper~\cite{Hunt:2010}, 15 | which both provide hierarchical key-value stores for small 16 | amounts of configuration data. In addition to basic operations such as 17 | \emph{get} and \emph{put}, they also provide synchronization primitives 18 | like \emph{compare-and-swap}, enabling concurrent clients to coordinate 19 | safely. 20 | 21 | \begin{figure} 22 | \centering 23 | \includegraphics[scale=.50]{motivation/statemachine} 24 | \vcaption[replicated state machine architecture]{ 25 | Replicated state machine architecture. 26 | The consensus algorithm 27 | manages a replicated log containing state machine commands from 28 | clients. The state machines process identical sequences of commands 29 | from the logs, so they produce the same outputs.} 30 | \label{fig:motivation:statemachine} 31 | \end{figure} 32 | 33 | Replicated state machines are typically implemented using a replicated 34 | log, as shown in Figure~\ref{fig:motivation:statemachine}. 
Each server stores a log 35 | containing a series of commands, which its state machine executes in order. 36 | Each log contains the same commands in the same order, so each state 37 | machine processes the same sequence of commands. Since the state 38 | machines are deterministic, each computes the same state and the same 39 | sequence of outputs. 40 | 41 | Keeping the replicated log consistent is the job of the consensus 42 | algorithm. The consensus 43 | module on a server receives commands from clients and adds them to its log. 44 | It communicates with the consensus modules on other servers to 45 | ensure that every log eventually contains 46 | the same requests in the 47 | same order, even if some servers fail. Once commands are properly 48 | replicated, they are said to be \emph{committed}. Each server's state machine processes 49 | committed commands in log order, and the outputs are returned to clients. 50 | As a result, the servers appear to 51 | form a single, highly reliable state machine. 52 | 53 | Consensus algorithms for practical systems typically have the following 54 | properties: 55 | \begin{itemize} 56 | \item They ensure \emph{safety} (never returning an incorrect 57 | result) under all non-Byzantine 58 | conditions, including network delays, 59 | partitions, and packet loss, duplication, and reordering. 60 | \item They are 61 | fully functional (\emph{available}) as long as any majority of the servers are 62 | operational and can communicate with each other and with clients. 63 | Thus, a typical cluster of five servers can tolerate the 64 | failure of any two servers. 65 | Servers are assumed to fail by stopping; they may later recover from 66 | state on stable storage and rejoin the cluster. 67 | \item They do not depend on timing to ensure the consistency 68 | of the logs: faulty clocks and extreme message delays can, at worst, 69 | cause availability problems. That is, they maintain safety under an 70 | \emph{asynchronous} model~\cite{Lynch:1996}, in 71 | which messages and processors proceed at arbitrary speeds. 72 | \item In the common 73 | case, a command can complete as soon as a majority of the cluster 74 | has responded to a single round of remote procedure calls; a minority 75 | of slow servers need not impact overall system performance. 76 | \end{itemize} 77 | 78 | -------------------------------------------------------------------------------- /motivation/uses.tex: -------------------------------------------------------------------------------- 1 | \section{Common use cases for replicated state machines} 2 | \label{motivation:uses} 3 | 4 | Replicated state machines are a general-purpose building block for making 5 | systems fault-tolerant. They can be used in a variety of ways, and this 6 | section discusses some typical usage patterns. 7 | 8 | \begin{figure} 9 | \hfill 10 | \begin{subfigure}{.45\textwidth} 11 | \centering 12 | \includegraphics[scale=.5]{motivation/activeactive} 13 | \caption{ 14 | The nodes in the cluster coordinate among themselves by reading from and 15 | writing to the replicated state machine. 16 | \\ 17 | } 18 | \label{fig:motivation:activeactive} 19 | \end{subfigure} 20 | \hfill 21 | \begin{subfigure}{.45\textwidth} 22 | \centering 23 | \includegraphics[scale=.5]{motivation/activepassive} 24 | \caption{ 25 | One leader actively manages the nodes in the cluster and records its 26 | state using the replicated state machine. Other standby servers are 27 | passive until the leader fails. 
28 | } 29 | \label{fig:motivation:activepassive} 30 | \end{subfigure} 31 | \hfill 32 | \vcaption[common patterns for using a single replicated state machine]{ 33 | Common patterns for using a single replicated state machine. 34 | } 35 | \end{figure} 36 | 37 | 38 | Most common deployments of consensus have just three or five servers 39 | forming one replicated state machine. Other servers can then use this 40 | state machine to coordinate their activities, as shown in 41 | Figure~\ref{fig:motivation:activeactive}. These systems often use the 42 | replicated state machine to provide 43 | group membership, configuration management, or locks~\cite{Hunt:2010}. 44 | As a more specific example, the replicated state machine could provide a 45 | fault-tolerant work queue, and other servers could coordinate using the 46 | replicated state machine to assign work to themselves. 47 | 48 | A common simplification to this usage is shown in 49 | Figure~\ref{fig:motivation:activepassive}. In this pattern, one 50 | server acts as leader, managing the rest of the servers. 51 | The leader stores its critical data in the consensus system. 52 | In case it fails, other standby servers compete for the position of 53 | leader, and if they succeed, they use the data in the consensus system 54 | to continue operations. 55 | Many large-scale storage systems that have a single cluster leader, such as 56 | GFS~\cite{Ghemawat:2003}, HDFS~\cite{Shvachko:2010}, and 57 | RAMCloud~\cite{Ousterhout:2011}, use this approach. 58 | 59 | \begin{figure} 60 | \centering 61 | \includegraphics[scale=.5]{motivation/bigdata} 62 | \vcaption[partitioned large-scale storage system using consensus]{ 63 | Partitioned large-scale storage system using consensus. 64 | For scale, data is partitioned across many replicated state machines. 65 | Operations that span partitions use a two-phase commit protocol. 66 | } 67 | \label{fig:motivation:bigdata} 68 | \end{figure} 69 | 70 | Consensus is also sometimes used to replicate very large amounts of 71 | data, as shown in Figure~\ref{fig:motivation:bigdata}. Large storage 72 | systems, such as Megastore~\cite{Baker:2011}, 73 | Spanner~\cite{Corbett:2012}, and Scatter~\cite{Glendenning:2011}, 74 | store too much data to fit in a single group 75 | of servers. They partition their data across many replicated state machines, 76 | and operations that span multiple partitions use a two-phase commit 77 | protocol (2PC) to 78 | maintain consistency. 
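As a concrete illustration of the pattern in
Figure~\ref{fig:motivation:activepassive}, standby servers can compete for
leadership with a compare-and-swap on a well-known key in the replicated
state machine (one of the primitives mentioned in
Section~\ref{motivation:problem}). The sketch below is hypothetical: the
\texttt{KeyValueStore} type is an in-memory stand-in for the replicated
store, and a real deployment would also use a lease or session mechanism so
that leadership cannot be taken from a live leader.
\begin{verbatim}
#include <map>
#include <string>

// In-memory stand-in for the replicated key-value store; in a real
// system these operations would go through the consensus protocol.
struct KeyValueStore {
    std::map<std::string, std::string> data;
    bool compareAndSwap(const std::string& key, const std::string& expected,
                        const std::string& desired) {
        if (data[key] != expected)
            return false;
        data[key] = desired;
        return true;
    }
    std::string get(const std::string& key) { return data[key]; }
};

// Called by a standby server that believes the current leader has
// failed; returns true if this server took over as leader.
bool tryBecomeLeader(KeyValueStore& store, const std::string& myAddress)
{
    std::string current = store.get("cluster-leader");
    return store.compareAndSwap("cluster-leader", current, myAddress);
}
\end{verbatim}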
79 | 80 | 81 | -------------------------------------------------------------------------------- /online-trim.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/online-trim.pdf -------------------------------------------------------------------------------- /online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/online.pdf -------------------------------------------------------------------------------- /performance/Makefrag: -------------------------------------------------------------------------------- 1 | FIGGENPDF := $(FIGGENPDF) \ 2 | performance/optimizedpipeline.pdf \ 3 | performance/unoptimizedpipeline.pdf \ 4 | performance/throughput.pdf \ 5 | performance/latency.pdf 6 | 7 | performance/optimizedpipeline.pdf: performance/pipeline.svg 8 | inkscape -T -z -i optimized -A $@ $< 9 | performance/unoptimizedpipeline.pdf: performance/pipeline.svg 10 | inkscape -T -z -i unoptimized -A $@ $< 11 | -------------------------------------------------------------------------------- /performance/implementation.tex: -------------------------------------------------------------------------------- 1 | \section{Implementation} 2 | 3 | We have implemented Raft as part of LogCabin, a replicated state machine 4 | implemented 5 | as a network service. We initially developed LogCabin to store 6 | configuration information for RAMCloud~\cite{Ousterhout:2011} and assist 7 | in failover of the RAMCloud coordinator. We had planned to implement 8 | Paxos in LogCabin, but the difficulties we faced motivated us to develop 9 | Raft. LogCabin then served as our test platform for new ideas in Raft, 10 | and also as a way to verify that we understood the issues of building a 11 | complete and practical system. The Raft implementation in LogCabin 12 | contains roughly \num{2000} lines of C++ code, not including tests, comments, 13 | or blank lines. The source code is freely available~\cite{logcabin}. 14 | Its architecture is discussed in the next section. 15 | 16 | In addition to LogCabin, there are dozens of third-party open-source 17 | implementations of Raft in various stages of 18 | development~\cite{implementations}. Many of these use different 19 | architectures than LogCabin, such as the actor 20 | model~\cite{impl:rafter,impl:akka-raft,impl:archie-raft} or event-based 21 | programming~\cite{impl:kanaka-raft-js,impl:barge,impl:kontiki}. Various 22 | companies are also deploying Raft-based systems~\cite{implementations}. 23 | For example, Facebook is currently testing HydraBase, a fork of Apache 24 | HBase~\cite{HBase} 25 | that uses Raft for replication~\cite{HydraBase}. 26 | 27 | \subsection{Threaded architecture} 28 | \label{performance:threads} 29 | 30 | \begin{figure} 31 | \centering 32 | \includegraphics[scale=.4]{performance/threads} 33 | \vcaption[threaded architecture]{ 34 | In LogCabin, 35 | consensus state for each server is stored in a monitor protected by a 36 | single lock, accessed by a collection of threads. The threads 37 | communicate with other servers (``peer threads''), handle incoming 38 | requests from clients and other servers (``service threads''), execute 39 | commands in the state machine (``state machine thread''), implement 40 | timeouts (``timer threads''), and write log entries to disk (``log sync 41 | thread''). 
42 | } 43 | \label{fig:performance:architecture} 44 | \end{figure} 45 | 46 | Raft lends itself to a straightforward implementation architecture using 47 | threads, as shown in Figure~\ref{fig:performance:architecture}. This is 48 | not the only possible architecture, but it is the approach we have taken 49 | in LogCabin. Each server consists of a collection of shared state 50 | variables managed in a monitor style with a single lock. Five groups of 51 | threads call into the monitor to manipulate the state: 52 | % 53 | \begin{itemize} 54 | % 55 | \item \textbf{Peer threads:} 56 | % 57 | There are as many peer threads as there are other servers in the 58 | cluster; each peer thread manages the RPCs to one of the other servers. 59 | Each thread enters the consensus state monitor, using a condition 60 | variable to wait for events that require communication with the given 61 | server. Then it leaves the monitor (releasing the lock) and issues an 62 | RPC. Once the RPC completes (or fails), the peer thread reenters the 63 | consensus state monitor, updates state variables based on the RPC, and 64 | waits for the next event that requires communication. 65 | % 66 | \item \textbf{Service threads:} 67 | % 68 | Several threads handle incoming requests from clients and other servers. 69 | These threads wait outside the consensus state monitor for incoming 70 | requests, then enter the monitor to carry out each request. 71 | % 72 | \item \textbf{State machine thread:} 73 | % 74 | One thread executes the state machine. It enters the consensus state 75 | monitor to wait for the next committed log entry; when an entry is 76 | available, it leaves the monitor, executes the command, and returns to 77 | the monitor to wait for the next command. 78 | % 79 | \item \textbf{Timer threads:} 80 | % 81 | One thread manages the election timer for both followers and candidates; 82 | it starts a new election once a randomized election timeout has elapsed. 83 | A second thread causes the server to return to the follower state if, as 84 | leader, it is unable to communicate with a majority of the cluster; 85 | clients are then able to retry their requests with another server (see 86 | Section~\ref{clients:findleader}). 87 | % 88 | \item \textbf{Log sync thread:} When the server is leader, one thread 89 | writes log entries durably to disk. This is done without holding the 90 | lock on the consensus state, so replication to followers can proceed in 91 | parallel; see Section~\ref{performance:leaderdisk}. For simplicity, 92 | followers and candidates write directly to disk from their service 93 | threads while holding the consensus lock; they do not use the log sync 94 | thread. 
95 | % 96 | \end{itemize} 97 | -------------------------------------------------------------------------------- /proof/defs.tex: -------------------------------------------------------------------------------- 1 | \newtheorem{theorem}{Theorem} 2 | \newtheorem{definition}{Definition} 3 | \newtheorem{lemma}{Lemma} 4 | \newenvironment{sketch} 5 | {\begin{proof}[Sketch]} 6 | {\phantom{\qedhere}\end{proof}} 7 | \newenvironment{assertion} 8 | {\begin{proof}[Assertion]} 9 | {\phantom{\qedhere}\end{proof}} 10 | 11 | \newcommand\fn[1]{\textsc{#1}} 12 | \newcommand\tab{\ \ \ \ } 13 | 14 | \newcommand\sland{\ \land\ } 15 | \newcommand\slor{\ \lor\ } 16 | 17 | \newcommand\ctrl[1]{\mbox{\textbf{#1}}} 18 | \newcommand\cif{\ctrl{if }} 19 | \newcommand\cthen{\ctrl{ then}} 20 | \newcommand\celse{\ctrl{else}} 21 | \newcommand\celif{\ctrl{elif }} 22 | \newcommand\clet{\ctrl{let }} 23 | \newcommand\cin{\ctrl{ in}} 24 | \newcommand\cforall{\ctrl{for all }} 25 | \newcommand\cdo{\ctrl{ do}} 26 | 27 | \newcommand\msg[1]{\textsc{#1}} 28 | \newcommand\ClientRequest{\msg{ClientRequest}} 29 | \newcommand\ClientResponse{\msg{ClientResponse}} 30 | \newcommand\RequestVoteRequest{\msg{RequestVoteRequest}} 31 | \newcommand\RequestVoteResponse{\msg{RequestVoteResponse}} 32 | \newcommand\AppendEntriesRequest{\msg{AppendEntriesRequest}} 33 | \newcommand\AppendEntriesResponse{\msg{AppendEntriesResponse}} 34 | 35 | \newcommand\dec[1]{\mathcal{#1}} 36 | \newcommand\messages{\dec{M}} 37 | \newcommand\replicas{\dec{R}} 38 | \newcommand\peers{\replicas - \{i\}} 39 | \newcommand\ppeers{(\replicas - \{i\})} 40 | \newcommand\operations{\dec{O}} 41 | \newcommand\clients{\dec{C}} 42 | \newcommand\indexes{\dec{I}} 43 | \newcommand\terms{\dec{T}} 44 | \newcommand\seqs{\dec{S}} 45 | \newcommand\values{\dec{V}} 46 | \newcommand\power{\dec{P}} 47 | \newcommand\g{\dec{G}} 48 | \newcommand\booleans{\dec{B}} 49 | 50 | \newcommand\st[1]{\textsc{#1}} 51 | \newcommand\follower{\st{follower}} 52 | \newcommand\candidate{\st{candidate}} 53 | \newcommand\leader{\st{leader}} 54 | 55 | \newcommand\startfn{\tab\=\+Precondition: \=\+\kill} 56 | \newcommand\precond{\} 57 | \newcommand\effects{\} 58 | 59 | \newcommand\is{\triangleq} 60 | \newcommand\be{\is} 61 | \newcommand\domain{\mbox{DOMAIN }} 62 | \newcommand\cat{\ \|\ } 63 | -------------------------------------------------------------------------------- /related/Makefrag: -------------------------------------------------------------------------------- 1 | FIGGENPDF := $(FIGGENPDF) \ 2 | related/rsm.pdf \ 3 | related/primarybackup.pdf 4 | 5 | related/rsm.pdf: related/primarybackup-rsm.svg 6 | inkscape -T -z -i rsm -A $@ $< 7 | related/primarybackup.pdf: related/primarybackup-rsm.svg 8 | inkscape -T -z -i primarybackup -A $@ $< 9 | -------------------------------------------------------------------------------- /related/algos.tex: -------------------------------------------------------------------------------- 1 | \section{Overview of consensus algorithms} 2 | \label{related:overview} 3 | 4 | This section introduces existing consensus algorithms that are 5 | comparable to Raft, specifically Paxos, Viewstamped Replication, and 6 | Zab. Like Raft, these algorithms handle 7 | fail-stop but not Byzantine failures, and they do not rely on time for 8 | safety (the key properties of practical consensus algorithms can be 9 | found in Section~\ref{motivation:problem}). Readers may also be 10 | interested in van Renesse \emph{et al.}'s more theoretical comparison of 11 | these algorithms~\cite{Renesse:2014}. 
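Returning briefly to the LogCabin threading discussion above (performance/implementation.tex), the monitor style used by the peer threads can be sketched as follows. The types and helper functions are an illustrative reconstruction under assumed names, not LogCabin's actual code: the point is only that a peer thread waits inside the lock for work, drops the lock while the RPC is in flight, and reacquires it to apply the result.

```cpp
// Illustrative reconstruction (not LogCabin's code) of a monitor-style
// peer thread: all consensus state is guarded by one mutex, and each peer
// thread sleeps on a condition variable until it has something to send.
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct AppendEntriesRequest {};
struct AppendEntriesResponse {};

struct ConsensusState {
    std::mutex mutex;                    // the single lock for the monitor
    std::condition_variable stateChanged;
    uint64_t currentTerm = 0;
    uint64_t commitIndex = 0;
    // ... log, votedFor, per-peer nextIndex/matchIndex, etc.
    bool needsRpc(int peerId) const;     // assumed predicate
};

// Assumed helpers; their bodies are irrelevant to the threading pattern.
AppendEntriesRequest buildRequest(ConsensusState& state, int peerId);
AppendEntriesResponse sendRpc(int peerId, const AppendEntriesRequest& request);
void applyResponse(ConsensusState& state, int peerId,
                   const AppendEntriesResponse& response);

void peerThreadMain(ConsensusState& state, int peerId) {
    std::unique_lock<std::mutex> lock(state.mutex);
    while (true) {
        // Block inside the monitor until this peer needs to be contacted.
        state.stateChanged.wait(lock, [&] { return state.needsRpc(peerId); });
        AppendEntriesRequest request = buildRequest(state, peerId);
        lock.unlock();                            // don't hold the lock for I/O
        AppendEntriesResponse response = sendRpc(peerId, request);
        lock.lock();                              // reenter the monitor
        applyResponse(state, peerId, response);   // update matchIndex, term, ...
        state.stateChanged.notify_all();
    }
}
```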
12 | 13 | Other consensus algorithms exist for 14 | different system models, but these are less commonly used. Notably, some 15 | algorithms address Byzantine consensus, where arbitrary failures and 16 | misbehaviors are possible~\cite{Castro:1999,Liskov:2010,Martin:2005}; 17 | these are more complex and lower in performance than algorithms under 18 | the fail-stop model. 19 | 20 | \subsection{Paxos} 21 | 22 | Paxos (most commonly Multi-Paxos) is the most widely deployed consensus 23 | algorithm today: 24 | % 25 | \begin{itemize} 26 | % 27 | \item Several Google systems use Paxos, including the 28 | Chubby~\cite{Burrows:2006, Chandra:2007} lock service and the 29 | Megastore~\cite{Baker:2011} and Spanner~\cite{Corbett:2012} storage 30 | systems. Chubby is used for cluster metadata, whereas 31 | Megastore and Spanner use Paxos for all of their data storage. 32 | % 33 | \item Microsoft also uses Paxos in various systems. Microsoft's 34 | Autopilot service~\cite{Isard:2007} (used by Bing) and Windows Azure 35 | Storage~\cite{Calder:2011} use Paxos for metadata. Azure's Active 36 | Directory Availability Proxy~\cite{azure:availability} uses Paxos to 37 | agree on a series of requests for arbitrary REST services. 38 | % 39 | % 40 | % 41 | \item The open-source Ceph storage system uses Paxos to store its 42 | \emph{cluster map}, the data structure that allows clients to find 43 | where objects are located~\cite{Weil:2006,ceph:monitor}. 44 | % 45 | \item Recently, eventually-consistent data stores such as 46 | Cassandra~\cite{Cassandra} and Riak~\cite{Riak} have added Paxos to 47 | provide linearizable access for some data. Cassandra appears to 48 | use an unoptimized implementation of Basic Paxos~\cite{Ellis:2013}, and 49 | a future release of Riak will include an implementation of 50 | Multi-Paxos~\cite{Blomstedt:2013}. 51 | % 52 | % 53 | \end{itemize} 54 | 55 | Paxos is a broad term for a whole family of consensus protocols. 56 | Lamport's original description of Paxos~\cite{Lamport:1998} 57 | presents sketches for a complete system but not in 58 | enough detail to implement. 59 | Several subsequent papers attempt to explain Paxos~\cite{Lamport:2001, 60 | Lampson:1996, Lampson:2001}, but they also 61 | don't explain their algorithms completely enough to implement. There are 62 | many other elaborations of Paxos, which fill in missing details and 63 | modify Paxos to provide a better foundation for 64 | implementation~\cite{Renesse:2011, Kirsch:2008}. Additionally, we 65 | developed our own explanation for and elaboration of Paxos in a video 66 | lecture as part of the Raft user study~\cite{study}; the Multi-Paxos 67 | variant we used is summarized in 68 | Figure~\ref{fig:appendix:userstudy:paxossummary1}. Unfortunately, all of 69 | these elaborations of Paxos differ from each other. This is burdensome 70 | for readers, and it also makes comparisons difficult. 71 | % 72 | Ultimately, most implementations bear little resemblance to the Paxos 73 | literature, and some may even deviate so far from Paxos as to resemble Raft. 
74 | After reading an earlier draft of the Raft paper, one Spanner developer 75 | made the following remark during a talk: 76 | % 77 | \begin{quote} 78 | % 79 | Our Paxos implementation is actually closer to the Raft algorithm than 80 | to what you read in the Paxos paper.~\cite{Kanthak:2013} 81 | % 82 | \end{quote} 83 | % 84 | For the purpose of 85 | this chapter, we have tried to compare Raft to common ideas found in 86 | Multi-Paxos elaborations, but we did not limit our discussion to a 87 | particular algorithm. 88 | 89 | Chapter~\ref{motivation} discussed how Paxos is difficult to understand 90 | and is a poor foundation for building systems. Its single-decree 91 | formulation is difficult to decompose, and Multi-Paxos leaves the log 92 | with too much nondeterminism and too little structure (e.g., it can have 93 | holes). Multi-Paxos uses only a very weak form of leadership as a 94 | performance optimization. These problems make Paxos needlessly complex, 95 | which burdens both students and systems builders. 96 | 97 | \subsection{Leader-based algorithms} 98 | 99 | Viewstamped Replication and Zab are two leader-based consensus 100 | algorithms that are closer in structure to Raft and therefore share many 101 | of Raft's advantages over Paxos. As in Raft, each algorithm first elects 102 | a leader, then has that leader manage the replicated log. The algorithms 103 | differ from Raft in how they handle leader election and repairing 104 | inconsistencies in the logs after leader changes; the next sections 105 | in this chapter go into more details on these differences. 106 | 107 | Oki and Liskov's Viewstamped Replication is a leader-based consensus 108 | algorithm developed around the same time as Paxos. The original 109 | description~\cite{Oki:1988,Oki:1988t} was intertwined with a protocol 110 | for distributed transactions, which may have caused many readers to 111 | overlook its contributions. The core consensus algorithm has been 112 | separated in a recent update called Viewstamped Replication 113 | Revisited~\cite{Liskov:2012}, and \mazieres~\cite{Mazieres:2007} also 114 | expanded on the details of the core algorithm before Liskov's update. 115 | Though Viewstamped Replication is not widely used in practice, it was 116 | used in the Harp File System~\cite{Liskov:1991}. 117 | 118 | Zab~\cite{Junqueira:2011}, which stands for ZooKeeper Atomic Broadcast, 119 | is a much more recent algorithm that resembles Viewstamped 120 | Replication. It is used in the Apache ZooKeeper 121 | coordination service~\cite{Hunt:2010}, which is the most popular 122 | open-source consensus system today. A cluster membership change 123 | mechanism was recently developed for Zab~\cite{Shraer:2012} and is 124 | scheduled for a future ZooKeeper release~\cite{ZOOKEEPER-107}. 125 | 126 | Raft has less mechanism than Viewstamped Replication and Zab 127 | because it minimizes the functionality in non-leaders. For example, we 128 | counted the message types Viewstamped Replication Revisited and Zab use 129 | for basic consensus and membership changes (excluding log compaction and 130 | client interaction, as these are nearly independent of the algorithms). 131 | Viewstamped Replication Revisited and Zab each define 10 different 132 | message types, while Raft has only 4 message types (two RPC requests and 133 | their responses). Raft's messages are a bit more dense than the other 134 | algorithms', but they are simpler collectively. 
In addition, Viewstamped 135 | Replication and Zab are described in terms of transmitting entire logs 136 | during leader changes; additional message types will be required to 137 | optimize these mechanisms so that they are practical. 138 | 139 | Zab presents a slightly stronger guarantee than Raft for clients issuing 140 | concurrent requests. If a client pipelines multiple requests, Zab 141 | guarantees that they are committed in order (if at all); this property is 142 | called \emph{FIFO client order}. For example, this allows a client to 143 | issue a bunch of changes and then release a lock, all asynchronously; 144 | other clients will see the changes reflected in the replicated state 145 | machine before they see the lock released. Paxos does not satisfy this 146 | property, since commands are assigned to log entries with few 147 | constraints; see~\cite{Junqueira:2011}. Raft and Viewstamped Replication 148 | could provide the same guarantee as Zab, since their leaders append new 149 | entries in order to the log. However, some extra care would be required to 150 | prevent network and client retries from reordering the client's commands 151 | to leaders. 152 | -------------------------------------------------------------------------------- /related/compaction.tex: -------------------------------------------------------------------------------- 1 | \section{Log compaction} 2 | \label{related:compaction} 3 | 4 | Log compaction is a necessary component of any consensus-based system, 5 | but unfortunately, the topic is neglected in many papers. We can think 6 | of two reasons why this might be the case: 7 | % 8 | \begin{enumerate} 9 | % 10 | \item Most of the issues of log compaction are equally applicable 11 | to all consensus algorithms. All algorithms must eventually commit each 12 | log entry, and committed entries can then be compacted without affecting 13 | the consensus algorithm much (since consensus has already been reached). 14 | Thus, from a theoretical point of view, compaction is nearly orthogonal 15 | to the consensus algorithm and may not logically belong in a paper about a 16 | consensus algorithm. 17 | % 18 | \item Log compaction involves a large number of design choices, 19 | and some of these may vary by implementation. Different approaches trade 20 | off complexity, performance, and resource utilization in different ways, 21 | and implementations may vary significantly in their requirements (for 22 | example, ranging from very small to very large state machines). Some 23 | authors attempt to describe algorithms in the most general terms 24 | possible, and it is difficult to be inclusive of all possible 25 | implementations when facing such a large design space. 26 | % 27 | \end{enumerate} 28 | 29 | This dissertation discussed several forms of log compaction. The 30 | biggest design choice is between incremental approaches (described in 31 | Section~\ref{compaction:incremental}), and snapshotting, which is 32 | simpler but less efficient. 33 | Many consensus-based systems use some form of snapshotting. 34 | Raft's snapshotting approach is very similar to that of Chubby~\cite{Chandra:2007}, 35 | and a similar snapshotting approach is outlined briefly in Viewstamped 36 | Replication Revisited~\cite{Liskov:2012}. 
37 | 38 | ZooKeeper~\cite{Hunt:2010} uses \emph{fuzzy snapshots}: rather than 39 | taking a consistent snapshot using copy-on-write techniques, a snapshot 40 | in ZooKeeper can partially reflect later changes, thereby not 41 | representing the state of the system at a particular point in time. The 42 | changes that may or may not have already been applied to the snapshot 43 | are reapplied on server startup, resulting in a consistent state. 44 | Unfortunately, fuzzy snapshots are covered by a US 45 | patent~\cite{Reed:2010}, and they are also more difficult to reason 46 | about than consistent snapshots. 47 | -------------------------------------------------------------------------------- /related/correctness.tex: -------------------------------------------------------------------------------- 1 | \section{Correctness} 2 | \label{related:correctness} 3 | 4 | The consensus community has primarily focused its correctness efforts on 5 | proofs of safety. Most of the widely accepted consensus algorithms have 6 | been proven safe in some form, including single-decree 7 | Paxos~\cite{Lamport:1998,Prisco:2000,Lamport:2013e}, 8 | Multi-Paxos~\cite{Boichat:2003,Schiper:2014}, 9 | EPaxos~\cite{Moraru:2013tr}, and 10 | Zab~\cite{Junqueira:2010}. We have only found informal sketches for 11 | Viewstamped Replication~\cite{Liskov:2012}. 12 | 13 | There are various approaches to proofs. On one axis, proofs range from 14 | less formal to more formal. Informal sketches are useful for building intuition 15 | but might overlook errors. 16 | For Raft, we have developed a fairly detailed (semi-formal) proof and 17 | have also included informal sketches for intuition. 18 | The most formal proofs are machine-checked; they are so precise that a 19 | computer program can verify their correctness. 20 | These proofs are not always easy to understand, but they 21 | establish the truth of the statements proven with complete certainty. 22 | Machine-checked proofs are not yet standard in distributed systems (they 23 | are more popular in, for example, the programming languages community), 24 | and we struggled to create one ourselves. 25 | However, recent work argues for this 26 | approach~\cite{Lamport:2011,Schiper:2014}, and the 27 | EventML~\cite{Schiper:2014} authors have shown their approach can be 28 | feasible for consensus by proving Multi-Paxos correct. Pairing 29 | machine-checked proofs with informal sketches can get the best of both 30 | worlds, and we hope to see the distributed systems community move in 31 | that direction. 32 | 33 | Proofs also range in how directly they apply to real-world systems. Some 34 | prove properties on very simplified models; these can aid understanding 35 | but have limited direct value for the correctness of complete systems. 36 | For example, real systems vary so much from single-decree Paxos that 37 | they may not benefit much from its proofs. Other proofs operate on more 38 | complete specifications (e.g., the Raft proof presented in 39 | Appendix~\ref{appendix:correctness} and the proof for 40 | EPaxos~\cite{Moraru:2013tr}); real-world implementations are 41 | closer to these specifications, so these proofs are closer to proving 42 | properties on real-world code. Some proof systems can even generate 43 | working implementations, which eliminates the possibility of errors in 44 | translation from the specification to the implementation (e.g., 45 | EventML~\cite{Schiper:2014}). 
However, this approach has not been very 46 | popular in practice so far, perhaps because real-world systems have 47 | additional needs, such as performance, that are harder to accommodate in 48 | the generated code. 49 | 50 | We have not found many proofs of liveness or availability (nor have we 51 | contributed any for Raft). These properties may be harder to formalize, 52 | but we hope to see a greater emphasis on this in the future. 53 | -------------------------------------------------------------------------------- /related/leaderelection.tex: -------------------------------------------------------------------------------- 1 | \section{Leader election} 2 | \label{related:leaderelection} 3 | 4 | This section discusses how different consensus algorithms address leader 5 | election. Raft uses an approach with very little mechanism, while other 6 | algorithms are generally more complex without offering practical 7 | advantages. 8 | 9 | In a broad sense, leader election includes the following four issues, 10 | which the following subsections discuss in depth: 11 | % 12 | \begin{enumerate} 13 | % 14 | \item \textbf{Detecting a failed leader.}\\ 15 | % 16 | Raft uses heartbeats and timeouts. 17 | % 18 | \item \textbf{Neutralizing deposed leaders.}\\ 19 | % 20 | In Raft, candidates propagate a new term number while soliciting votes 21 | and replicating the log. 22 | % 23 | \item \textbf{Selecting a server to be the new leader.}\\ 24 | % 25 | Raft uses randomized timeouts, and the first candidate to time out 26 | usually becomes leader. Voting ensures that there is at most one leader 27 | per term. 28 | % 29 | \item \textbf{Ensuring the leader has all committed entries.}\\ 30 | % 31 | In Raft, the log comparison check during voting ensures that a new 32 | leader already has all committed entries; no log entries are 33 | transferred. 34 | % 35 | \end{enumerate} 36 | 37 | \subsection{Detecting and neutralizing a failed leader} 38 | 39 | In all practical settings, it is impossible to distinguish a failed server from a 40 | slow server; this is the key characteristic of an asynchronous system. 41 | Fortunately, practical consensus algorithms preserve safety even if 42 | leaders are suspected of failing when they are simply slow. Thus, 43 | failure detection only needs to detect failed servers eventually 44 | (\emph{completeness}) and not suspect available servers with high 45 | probability (\emph{accuracy}). These weak requirements are easily 46 | satisfied in practical systems by using heartbeats and timeouts. 47 | 48 | Various failure detectors built on heartbeats and timeouts have been 49 | discussed in the theoretical literature~\cite{Chandra:1996}. $\lozenge 50 | P$ (or equivalently, $\Omega$) is a failure detector with nice 51 | theoretical properties: eventually (after some unknown period of time), 52 | it will be perfectly correct and accurate. It does so by increasing its 53 | timeouts every time a suspicion is incorrect; eventually, its timeouts 54 | will be so large that it makes no false suspicions. 55 | However, this behavior is 56 | impractical for real systems, which care about availability: 57 | if the timeout value grows too large, the 58 | cluster will wait too long to detect a leader failure. It is better to 59 | falsely suspect a leader of failure when it is slow than to wait around 60 | to be sure. Therefore, Raft's timeouts are fixed low enough to 61 | satisfy the system's availability requirements. 
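As an illustration of this fixed-timeout approach, the sketch below shows one way a follower-side election timer might look; the constants and names are examples rather than LogCabin's actual code. Each valid heartbeat from the leader pushes the randomized deadline back, and the timer thread starts an election if the deadline ever passes; the randomization also helps avoid split votes.

```cpp
// Illustrative follower-side election timer (names and constants are
// examples, not LogCabin's): each heartbeat from the leader resets a
// randomized deadline; if the deadline passes, the follower starts an
// election. The timeout range is fixed and low, favoring quick failure
// detection over never suspecting a slow leader.
#include <chrono>
#include <random>

using Clock = std::chrono::steady_clock;

class ElectionTimer {
public:
    void reset() {
        // Randomized within a fixed range, e.g. 150-300 ms.
        std::uniform_int_distribution<int> dist(150, 300);
        deadline_ = Clock::now() + std::chrono::milliseconds(dist(rng_));
    }
    bool expired() const { return Clock::now() >= deadline_; }

private:
    std::mt19937_64 rng_{std::random_device{}()};
    Clock::time_point deadline_;
};

// Called when a valid AppendEntries (heartbeat) arrives from the current leader.
void onHeartbeat(ElectionTimer& timer) { timer.reset(); }

// Polled by the timer thread; a true result triggers a new election.
bool shouldStartElection(const ElectionTimer& timer) { return timer.expired(); }
```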
62 | 63 | Paxos, Zab, and Viewstamped Replication either do not specify a failure 64 | detector or briefly mention the use of timeouts but do not spell out the 65 | details. This may be because approaches to failure detection are mostly 66 | independent of the consensus algorithm. However, we found that combining 67 | heartbeats with other messages has practical benefits. For example, 68 | Raft's AppendEntries RPC not only serves as a heartbeat but also informs 69 | followers of the latest commit index. 70 | 71 | Since failure detectors can mistakenly report the leader as having 72 | failed when it is in fact slow, a suspected leader must be neutralized. 73 | The various consensus algorithms handle this similarly using a 74 | monotonically increasing number (called a term in Raft, a proposal 75 | number in Paxos, a view in Viewstamped Replication, or an epoch in Zab). 76 | Once a server has seen a larger number, it will no longer accept 77 | requests from a leader with a smaller number. Most algorithms, including 78 | Raft, inform the sender that it is stale when a server receives such 79 | a request; in some descriptions of Paxos, however, the recipient does 80 | not reply. 81 | 82 | Algorithms assign term numbers to servers in two different ways. Zab and 83 | Raft use voting to ensure there is at most one leader per term: if a 84 | server is able to collect a majority of votes, it has exclusive use of 85 | that term number for replicating log entries. Paxos and Viewstamped 86 | Replication divide the space of numbers so that servers do not compete 87 | for particular numbers (e.g., by allocating numbers to servers in a 88 | round-robin fashion). There does not seem to be a practical difference 89 | between these two approaches, since voting must occur in either case. 90 | 91 | \subsection{Selecting a new leader and ensuring it has all committed 92 | entries} 93 | 94 | \begin{table} 95 | \centering 96 | \begin{tabular}{lccc} 97 | algorithm & new leader & vote collector & handles preferences \\ 98 | \hline 99 | \noalign{\vskip .75ex} 100 | Paxos & any server & new leader & yes\\ 101 | VR & has up-to-date log & view manager & no \\ 102 | VRR & determined by view number & new leader & no \\ 103 | Zab & any server & new leader & yes \\ 104 | Raft & has up-to-date log & new leader & no 105 | \end{tabular} 106 | \vcaption[summary of how different algorithms select a new leader]{ 107 | Summary of how different algorithms select a new leader. The ``new 108 | leader'' column shows which servers may become the new leader. The 109 | ``vote collector'' column shows which server solicits votes; in all but 110 | the original Viewstamped Replication paper, this is the candidate for 111 | leadership. The ``handles preferences'' column shows which algorithms 112 | are able to accommodate preferences in which server becomes leader 113 | during election; other algorithms would need separate leadership transfer 114 | mechanisms to accommodate this. 115 | } 116 | \label{tab:related:leaderelection} 117 | \end{table} 118 | 119 | Algorithms differ in which server they select as leader, as summarized 120 | in Table~\ref{tab:related:leaderelection}. Paxos and Zab choose any 121 | server as leader, while the other algorithms restrict which server can 122 | become leader. One advantage of Paxos and Zab's approach is that they 123 | can accommodate preferences about which server should be leader during 124 | leader election. 
For example, if a deployment performs best when a 125 | server from a particular datacenter acts as leader, Paxos or Zab can 126 | allow that server to become leader. The other algorithms are not able to 127 | do so because they constrain which server may become leader; they need a 128 | separate leadership transfer mechanism (as described in 129 | Chapter~\ref{basicraft} for Raft) to accommodate such preferences. 130 | 131 | Viewstamped Replication Revisited uses a different round-robin 132 | approach for choosing which server becomes leader. The leader is a 133 | function of the view (term) number: in an $n$-server cluster, a server $i$ is 134 | the leader for view $v$ if $v\ \%\ n = i$. This approach has the 135 | advantage that clients can likely guess and find the leader based on the 136 | current view number (to do this, clients must track the current 137 | configuration and view number). However, it may result in additional 138 | delays if the designated leader for a view is unavailable or if servers 139 | have different notions of the current view. 140 | 141 | The original Viewstamped Replication algorithm is closest to Raft in 142 | that only a server whose log is as up-to-date as a majority of the 143 | cluster can become leader. This has a big advantage in that it avoids 144 | transferring log entries to the new leader; it simplifies the flow of 145 | data to go only from clients to leaders to followers. Viewstamped 146 | Replication uses one server to manage the election process (the view 147 | manager) and a different server becomes the leader. The view manager 148 | chooses the server with the most up-to-date log of a majority of the 149 | cluster to be the new leader, then informs that server of its new 150 | leadership role. In Raft, the same server both runs the election and 151 | becomes leader, which avoids some mechanism and reduces state space 152 | complexity. Zab also suggests choosing the new leader as having a 153 | sufficiently up-to-date log (like Raft) as a possible optimization, and 154 | this optimization is apparently implemented in 155 | ZooKeeper~\cite{ZooKeeperPersonalCommunication}. 156 | 157 | Paxos, Viewstamped Replication Revisited, and (unoptimized) Zab need 158 | additional mechanism to ensure the new leader has all committed entries, 159 | since they do not choose the leader based on its log. In Paxos, the 160 | leader typically runs both phases of single-decree Paxos for each log 161 | entry in which it does not know the committed value, until it reaches a 162 | log index for which no available server has seen any more proposals. 163 | This may result in significant delays until the new leader catches up. 164 | Viewstamped Replication Revisited and Zab are described as if servers 165 | send their entire logs to the new leader and the new leader adopts the 166 | most up-to-date one. This is a nice model but is impractical for large 167 | logs; both papers suggest optimizing this by sending fewer entries but 168 | do not spell out the details. 169 | -------------------------------------------------------------------------------- /related/logreplication.tex: -------------------------------------------------------------------------------- 1 | \section{Log replication and commitment} 2 | \label{related:logreplication} 3 | 4 | All consensus algorithms specify how to send new log entries to other 5 | servers and when to mark them committed. 
This is usually done in one 6 | round of communication from the leader in the normal case, and it is 7 | usually straightforward to apply batching and pipelining to make 8 | replicating multiple entries faster. 9 | 10 | The algorithms differ in how far they can proceed out of order. Raft, 11 | Zab, and Viewstamped Replication must all append and commit entries to 12 | the log in order, so that followers' logs always remain consistent with 13 | the leader's. Traditionally, Multi-Paxos allows servers to accept and 14 | commit values for entries in any order. This does not offer Paxos a 15 | significant performance advantage, however, since commands must still be 16 | applied to the state machines in order. Raft and the other algorithms 17 | that maintain a log in order can also transmit log entries out of 18 | order; they just cannot be appended to the log this way. (In these 19 | algorithms, servers could buffer the entries outside the log until they 20 | are ready to be appended, if desired.) 21 | 22 | \begin{figure} 23 | \centering 24 | \includegraphics[scale=.50]{related/rereplicate} 25 | \vcaption[differences in how new leaders replicate existing entries]{ 26 | Example of how algorithms differ in which entries a new leader 27 | replicates from its log. In Paxos, the new leader for term~4 executes 28 | phases~1 and~2 of Paxos for entries 4--8 using its new proposal number, 29 | since it does 30 | not believe that those are committed. As described in the Viewstamped 31 | Replication and Zab papers, the new leader replicates its entire log to 32 | the follower. In Raft, the leader only transmits entries 5--8 to the 33 | follower, the minimal number of entries required. 34 | } 35 | \label{fig:related:rereplicate} 36 | \end{figure} 37 | 38 | The algorithms also differ in what new leaders do with existing entries 39 | in their logs, as illustrated in Figure~\ref{fig:related:rereplicate}: 40 | % 41 | \begin{itemize} 42 | % 43 | \item In Paxos, a new leader goes through the two phases of 44 | single-decree Paxos for each uncommitted entry it finds, rewriting and 45 | renumbering them all with its current proposal number. This either 46 | commits the local value or discovers an existing committed value. 47 | Meanwhile, it can replicate and commit but not yet apply client commands 48 | in further log slots. 49 | % 50 | \item In Viewstamped Replication Revisited and Zab, a new leader 51 | transfers its entire initial log to each follower before starting its 52 | term, and the entire log is effectively renumbered with the new view. 53 | This is impractical for large logs and should be optimized to send fewer 54 | entries in practice, but the details have not been published. It is 55 | fairly easy to determine which entries to send if the two servers both 56 | participated in the last view but more difficult to determine otherwise 57 | (without the term numbers in each entry as in the figure, one idea would 58 | be to compare cumulative hashes of log prefixes). 59 | % 60 | \item 61 | A new leader in Raft transfers just the minimal number of entries to 62 | make other servers' logs match its own. After some back-and-forth with 63 | heartbeats to discover where the logs diverge, the only entries that are 64 | transferred are those that differ. 65 | % 66 | Key to this feature is that entries are not renumbered, so the same 67 | entry will have the same index and term across logs for all time. 
68 | Without this property, some servers would have an entry under its 69 | original term number, and others would have it under new term numbers. A 70 | subsequent leader would have to needlessly overwrite some of these 71 | copies, since it wouldn't know which ones contain the same command. 72 | % 73 | \end{itemize} 74 | 75 | By transferring log entries rather than logs, Raft allows more 76 | intermediate states than VR and Zab. These intermediate states 77 | are ambiguous in Raft and thus cannot be used for commitment (see 78 | Figure~\ref{fig:basicraft:oldTermCommit}). This has three 79 | consequences. 80 | 81 | First, if we could somehow observe a snapshot of an entire cluster, an 82 | entry in Raft can be present on a majority of servers but not committed. 83 | Instead, to determine whether an entry is committed, one must ask if 84 | future leaders must have the entry: does every server that could be 85 | elected leader with its current log have the entry in its log? If so, 86 | the entry is committed; otherwise, it is not. This requires more complex 87 | reasoning for an omniscient observer than in other algorithms: rather 88 | than counting how many replicas of the entry exist, one must 89 | essentially execute the consensus algorithm. 90 | 91 | Second, during operation, Raft has a two-part commitment rule, in which 92 | entries from prior terms are not directly marked committed; they are 93 | only marked committed once an entry from the current term has reached a 94 | majority of the cluster (at this point, any ambiguity is resolved). This 95 | does not significantly burden implementations, which only need a single 96 | additional \emph{if} statement. Interestingly, this commitment rule 97 | would not be possible in a single-decree consensus formulation; it 98 | relies on the log formulation so that later entries can commit earlier 99 | ones. 100 | 101 | Finally, infinite leader changes can require infinite space in Raft. 102 | Specifically, a leader has to create an entry in order to commit 103 | previous entries before it can compact them, but if it crashes first, its 104 | log will then contain an additional entry. In theory, this process could repeat 105 | and exhaust storage capacity. However, we don't believe this to be a 106 | significant practical concern, since it would be unlikely for leader 107 | election to succeed so frequently yet leaders to fail so frequently. 108 | 109 | An alternative to Raft's commitment approach would be to add an extra 110 | term to logs, similar to Viewstamped Replication Revisited. The log's 111 | term would be the term of the latest leader to replicate an entry to the 112 | log. The log's term would usually be the same as the term of the last 113 | entry in the log, but it would be ahead briefly while new leaders catch 114 | followers up to match the leader's initial log. If the log's term were 115 | used during elections instead of the term of the last entry, then the 116 | commitment rule could be simplified: commitment would require a majority 117 | of servers to have the entry and the same log term. Based on its 118 | similarity to Viewstamped Replication, we think this approach would 119 | work, though we haven't proved it correct. The downside is that this 120 | results in three terms to juggle: the server's current term, the 121 | log's term, and the terms in the individual entries. We think delaying 122 | commitment until the ambiguity is resolved is easier.
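To make the "single additional if statement" mentioned above concrete, here is a sketch of how a leader might advance its commit index under Raft's rule; the names are illustrative and the code is not taken from any particular implementation.

```cpp
// Sketch of Raft's commitment rule as discussed above (illustrative names,
// not a real implementation): the leader may advance its commit index to
// the highest index replicated on a majority, but only if the entry at
// that index was created in the leader's current term. Committing that
// entry also commits all earlier entries, including prior-term ones.
#include <algorithm>
#include <cstdint>
#include <vector>

struct LeaderState {
    uint64_t currentTerm = 0;
    uint64_t commitIndex = 0;
    // Highest log index known to be stored on each server, including the
    // leader's own log.
    std::vector<uint64_t> matchIndex;

    uint64_t termOfEntry(uint64_t index) const;  // assumed log lookup

    void advanceCommitIndex() {
        if (matchIndex.empty())
            return;
        // After sorting ascending, the element at the lower median position
        // is the highest index stored on a majority of servers.
        std::vector<uint64_t> sorted = matchIndex;
        std::sort(sorted.begin(), sorted.end());
        uint64_t majorityIndex = sorted[(sorted.size() - 1) / 2];
        if (majorityIndex > commitIndex &&
            termOfEntry(majorityIndex) == currentTerm) {  // the extra check
            commitIndex = majorityIndex;
        }
    }
};
```

For example, in a five-server cluster with matchIndex values {5, 7, 8, 9, 9}, the lower median is 8, so the leader commits through index 8 provided entry 8 was created in its current term; any earlier-term entries before index 8 become committed at the same time.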
123 | -------------------------------------------------------------------------------- /related/primarybackup.tex: -------------------------------------------------------------------------------- 1 | \section{Replicated state machines vs.\ primary copy approach} 2 | \label{related:rsm} 3 | 4 | \begin{figure} 5 | \centering 6 | 7 | \begin{subfigure}{\textwidth} 8 | \centering 9 | \includegraphics[scale=0.50]{related/rsm} 10 | \caption{ 11 | Traditional replicated state machine approach. 12 | } 13 | \end{subfigure} 14 | 15 | \vspace{4ex} 16 | 17 | \begin{subfigure}{\textwidth} 18 | \centering 19 | \includegraphics[scale=0.50]{related/primarybackup} 20 | \caption{ 21 | Primary copy approach. 22 | } 23 | \end{subfigure} 24 | 25 | \vcaption[primary copy architecture]{ 26 | In the primary copy architecture, the primary's state machine 27 | processes requests from clients and calculates resulting states, which 28 | its consensus module replicates into the servers' logs. The figure shows 29 | a client submitting a request to increment a variable $y$, which the 30 | primary translates into an operation to set $y$ to $2$. 31 | } 32 | \label{fig:related:primarybackup} 33 | \end{figure} 34 | 35 | The original Viewstamped Replication paper and ZooKeeper operate 36 | slightly differently from traditional replicated 37 | state machines, using a \emph{primary copy} architecture instead. 38 | The primary copy architecture is 39 | illustrated in Figure~\ref{fig:related:primarybackup}. It is similar to 40 | replicated state machines in that each server still has a consensus 41 | module, a state machine, and a log. However, the primary's (leader's) state machine 42 | processes requests as soon as they arrive from clients, 43 | instead of waiting for them to be committed. It then computes 44 | the state resulting from each request, and the final state, rather than 45 | the original requests, is replicated in the log using consensus. 46 | Once the log entries are committed, the effects of the client requests 47 | are externalized to clients. 48 | (For linearizability, the primary should also include client responses in 49 | the log entries, allowing backup servers to return the same response in 50 | case clients retry; see Chapter~\ref{clients}.) 51 | 52 | From the point of view of the consensus algorithm, the primary copy 53 | approach is very similar to replicated state machines. Thus, nearly all 54 | of the Raft algorithm applies equally well to the primary copy approach. 55 | However, the state machine and overall system are somewhat more complex 56 | in the primary copy approach. The two approaches differ in three ways. 57 | 58 | First, the primary's state machine in primary copy systems reflects 59 | uncommitted entries in the log, whereas in replicated state machines, 60 | the state machines only reflect committed entries. This distinction is 61 | necessary for primaries to produce the resulting states when they 62 | receive client requests, but it introduces two complications: the state 63 | machine must take care not to externalize any uncommitted state, and 64 | if another server becomes the primary, the old primary's state machine 65 | needs to roll back its recent uncommitted changes. 66 | 67 | Second, the log in the replicated state machine approach includes all 68 | client requests, even those that ended up having no effect. For example, 69 | a conditional write operation whose condition was not met would still 70 | occupy space in the log.
In the primary copy approach, the primary would 71 | not need to append anything new to its log for such failed operations 72 | (it would only need to wait until it was safe to externalize the 73 | response). On the other hand, this is unlikely to have a significant 74 | effect on the system's capacity, as logs must eventually be compacted in 75 | either approach. 76 | 77 | Third, the state machines in the replicated state machine approach must 78 | be deterministic, since every server must arrive at the same result 79 | after applying the same series of client requests. For example, the 80 | effects of client requests must not depend on each server's current 81 | time. In the primary copy approach, however, the primary's state machine 82 | need not be deterministic; it may do anything it likes with the request, 83 | as long as the state change it produces is deterministic. Fortunately, a 84 | hybrid approach allows replicated state machines to overcome this 85 | limitation in most cases: the server receiving a client request can 86 | augment that request with additional nondeterministic inputs, such as 87 | its current time and a random number, before appending the request into 88 | the replicated logs. All of the servers' state machines can then process 89 | the augmented request deterministically. 90 | 91 | 92 | -------------------------------------------------------------------------------- /related/related.tex: -------------------------------------------------------------------------------- 1 | \chapter{Related work} 2 | \label{related} 3 | 4 | This chapter discusses the strengths and weaknesses of Raft in the context 5 | of related work. Section~\ref{related:overview} first gives a brief 6 | introduction to other consensus algorithms and compares them to Raft at 7 | a high level. Then, 8 | Sections~\ref{related:leaderelection}--\ref{related:correctness} focus 9 | on more specific details of how these consensus algorithms compare to 10 | Raft. Finally, Section~\ref{related:understandability} discusses work 11 | related to evaluating understandability. 12 | 13 | 14 | 15 | \input{related/algos} 16 | \input{related/leaderelection} 17 | \input{related/logreplication} 18 | \input{related/membership} 19 | \input{related/compaction} 20 | \input{related/primarybackup} 21 | \input{related/performance} 22 | \input{related/correctness} 23 | \input{related/understandability} 24 | -------------------------------------------------------------------------------- /related/understandability.tex: -------------------------------------------------------------------------------- 1 | \section{Understandability} 2 | \label{related:understandability} 3 | 4 | Studies involving human factors are common in other areas of computer 5 | science, namely Human-Computer Interaction (HCI). HCI researchers 6 | typically iterate on designs using empirical measurements, drawing on 7 | incremental results from the study to guide improvements to their 8 | designs. To make this possible, the study must be relatively easy to 9 | repeat and relatively low in cost. A typical HCI study asks participants 10 | to learn and perform a task using a user interface, which takes little 11 | preparation and may only require a few minutes per participant.
In 12 | contrast, our primary goal was to compare Raft and Paxos, not to iterate 13 | on Raft, and the cost of the Raft study made it difficult to apply an 14 | iterative approach (we needed to prepare teaching materials and quizzes, 15 | and each participant needed to invest several hours in the study). Now 16 | that we have shown that Raft is easier to understand than Paxos, it may 17 | be feasible to do further iterative studies (A/B testing) to find better 18 | variations of Raft or better variations of its explanation. 19 | 20 | Side-stepping human factors altogether, 21 | NetComplex~\cite{Chun:2008} proposed a ``metric to quantify the notion 22 | of algorithmic complexity in network system design''. The metric 23 | calculates the distributed dependencies of state, where the complexity 24 | of each state variable is the sum of the complexity of its dependencies. 25 | The paper also compares the complexity of two-phase commit and 26 | single-decree Paxos according to this metric; as expected, it finds 27 | Paxos to be more complex. 28 | 29 | Clearly a formula for quantifying the complexity or understandability of 30 | an algorithm would be very useful. However, we do not know whether the 31 | formula proposed in the NetComplex paper is the right one. Many factors 32 | contribute to complexity, and their relative importance and the 33 | interactions between them are not well understood. It is also not 34 | obvious how to apply the proposed formula to the complete Raft 35 | algorithm, which is much larger than the examples given in the paper 36 | (but we would be very interested in seeing the result). 37 | -------------------------------------------------------------------------------- /stanford.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/stanford.pdf -------------------------------------------------------------------------------- /userstudy/breakdownlegend.svg: -------------------------------------------------------------------------------- [SVG markup not preserved in this text dump; the recoverable content is a chart legend titled ``quantile'' with the ranges 0-25%, 25-50%, 50-75%, and 75-100%.] -------------------------------------------------------------------------------- /userstudy/stylusoverlay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudy/stylusoverlay.png -------------------------------------------------------------------------------- /userstudymaterials/committed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/committed.png -------------------------------------------------------------------------------- /userstudymaterials/inconsistency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/inconsistency.png -------------------------------------------------------------------------------- /userstudymaterials/legala.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/legala.png -------------------------------------------------------------------------------- /userstudymaterials/legalb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/legalb.png -------------------------------------------------------------------------------- /userstudymaterials/legalc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/legalc.png -------------------------------------------------------------------------------- /userstudymaterials/legald.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/legald.png -------------------------------------------------------------------------------- /userstudymaterials/paxosLoga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/paxosLoga.png -------------------------------------------------------------------------------- /userstudymaterials/paxosLogb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/paxosLogb.png -------------------------------------------------------------------------------- /userstudymaterials/paxosLogc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/paxosLogc.png -------------------------------------------------------------------------------- /userstudymaterials/paxosLogd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/paxosLogd.png -------------------------------------------------------------------------------- /userstudymaterials/paxossummary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/paxossummary.pdf -------------------------------------------------------------------------------- /userstudymaterials/raftsummary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongardie/dissertation/0c10ea1b53d73bf6d163bdde8e4141a1cbd8f36a/userstudymaterials/raftsummary.pdf --------------------------------------------------------------------------------