├── .gitignore ├── Makefile ├── bookgen └── generate.js ├── docs └── HACKING.md ├── generate.js ├── input ├── 0_index.md ├── 1_intro.md ├── 2_abstractions.md ├── 3_time.md ├── 4_replication.md ├── 5_eventual.md └── 6_appendix.md ├── layouts └── default │ ├── footer.html │ ├── header.html │ ├── index-insert.html │ └── single-insert.html ├── output ├── abstractions.html ├── appendix.html ├── assets │ ├── assert.css │ ├── bgnoise.png │ ├── ebook.css │ ├── jquery-1.6.1.min.js │ ├── prettify.css │ ├── prettify.js │ ├── prettify_coffee.css │ ├── printable.css │ ├── quote_colors.js │ ├── runner.js │ ├── style.css │ └── sunburst.css ├── ebook.html ├── eventual.html ├── images │ ├── CAP.png │ ├── CAP_choice.png │ ├── barroso_holzle.png │ ├── chandra_failure_detectors.png │ ├── dist-sys-cover.png │ ├── epoch.png │ ├── format_epub.png │ ├── format_html.png │ ├── format_mobi.png │ ├── format_pdf.png │ ├── git-icon.png │ ├── global-clock.png │ ├── google-transact09.png │ ├── image.png │ ├── local-clock.png │ ├── news_120.jpg │ ├── oltp_overhead.png │ ├── part-repl.png │ ├── pbs.png │ ├── replication-async.png │ ├── replication-both.png │ ├── replication-sync.png │ ├── replication.pptx │ ├── statediagram.png │ ├── system-model.png │ ├── system-of-2.png │ ├── system-of-3.png │ └── vector_clock.svg.png ├── index.html ├── intro.html ├── mixu-distributed-systems-book.epub ├── mixu-distributed-systems-book.mobi ├── replication.html ├── single-page.html └── time.html ├── package.json └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | @node generate.js 3 | 4 | ebook: 5 | @echo "\n... generating $@" 6 | ebook-convert output/ebook.html output/mixu-distributed-systems-book.mobi \ 7 | --cover ./output/images/dist-sys-cover.png \ 8 | --max-levels 0 \ 9 | --chapter "//*[@class = 'chapter']" \ 10 | --chapter-mark=none \ 11 | --sr1-search "
" \ 12 | --sr1-replace "
" \ 13 | --sr2-search "
" \ 14 | --sr2-replace "

" \ 15 | --page-breaks-before='/' \ 16 | --linearize-tables \ 17 | --authors "Mikito Takada" \ 18 | --language en \ 19 | --output-profile kindle 20 | @echo "\n... generating $@" 21 | ebook-convert output/ebook.html output/mixu-distributed-systems-book.epub \ 22 | --cover ./output/images/dist-sys-cover.png \ 23 | --max-levels 0 \ 24 | --chapter "//*[@class = 'chapter']" \ 25 | --chapter-mark=none \ 26 | --sr1-search "
" \ 27 | --sr1-replace "
" \ 28 | --sr2-search "
" \ 29 | --sr2-replace "

" \ 30 | --page-breaks-before='/' \ 31 | --linearize-tables \ 32 | --authors "Mikito Takada" \ 33 | --no-default-epub-cover \ 34 | --language en 35 | 36 | .PHONY: build ebook 37 | -------------------------------------------------------------------------------- /bookgen/generate.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'), 2 | path = require('path'), 3 | existsSync = (fs.existsSync ? fs.existsSync : path.existsSync), 4 | marked = require('marked'), 5 | util = require('util'); 6 | 7 | var header = fs.readFileSync('./layouts/default/header.html').toString(), 8 | footer = fs.readFileSync('./layouts/default/footer.html').toString(); 9 | 10 | var BookGen = function() { }; 11 | 12 | BookGen.generate = function(config) { 13 | // get all the files in input 14 | if(!Array.isArray(config.input.files)) { 15 | // iterate the path and add all files 16 | fs.readdir(config.input.files, function (err, files) { 17 | if (err) throw err; 18 | var basename = path.basename(config.input.files); 19 | BookGen.processFiles(config, files.map(function(relname) { 20 | return path.normalize( basename + '/' + relname); 21 | })); 22 | }); 23 | } else { 24 | BookGen.processFiles(config, config.input.files); 25 | } 26 | }; 27 | 28 | BookGen.processFiles = function(config, files) { 29 | if(config.input.order == 'sort') { 30 | // sort the files 31 | files.sort(); 32 | } 33 | if(config.input.index) { 34 | // move the index file first 35 | var pos = files.indexOf(config.input.index); 36 | if(pos > -1) { 37 | files.splice(pos, 1); 38 | files.unshift(config.input.index); 39 | } 40 | } 41 | 42 | files = files.filter(function(name) { 43 | return !fs.statSync(name).isDirectory(); 44 | }); 45 | // concatenate the files 46 | console.log(files); 47 | 48 | var full = files 49 | .sort(function(a, b) { return a.localeCompare(b); }) 50 | .map(function(infile, index) { 51 | // add an anchor so that the epub links work 52 | return '' + 53 | BookGen.writeFile(infile, index, config); 54 | }).join('
'); 55 | 56 | // write a single page version as well 57 | fs.writeFile(config.output+'single-page.html', 58 | header.replace('assets/style.css', 'assets/printable.css') 59 | .replace(/{{prev}}/g, 'index.html') 60 | .replace(/{{next}}/g, 'index.html') 61 | .replace('', fs.readFileSync('./layouts/default/single-insert.html').toString()) + 62 | // change links to single page format 63 | full.replace('href="index.html"', 'href="#index"') 64 | .replace('href="intro.html"', 'href="#intro"') 65 | .replace('href="abstractions.html"', 'href="#abstractions"') 66 | .replace('href="time.html"', 'href="#time"') 67 | .replace('href="replication.html"', 'href="#replication"') 68 | .replace('href="eventual.html"', 'href="#eventual"') 69 | .replace('href="appendix.html"', 'href="#appendix"') + 70 | footer 71 | .replace(/{{prev}}/g, 'index.html') 72 | .replace(/{{next}}/g, 'index.html') 73 | ); 74 | fs.writeFile(config.output+'ebook.html', 75 | header.replace(/]+>/g, '') 76 | .replace(/{{prev}}/g, 'index.html') 77 | .replace(/{{next}}/g, 'index.html') 78 | .replace('', '') 79 | .replace('', '') + 80 | // change links to single page format 81 | full.replace('href="index.html"', 'href="#index"') 82 | .replace('href="intro.html"', 'href="#intro"') 83 | .replace('href="abstractions.html"', 'href="#abstractions"') 84 | .replace('href="time.html"', 'href="#time"') 85 | .replace('href="replication.html"', 'href="#replication"') 86 | .replace('href="eventual.html"', 'href="#eventual"') 87 | .replace('href="appendix.html"', 'href="#appendix"') + 88 | footer 89 | .replace(/{{prev}}/g, 'index.html') 90 | .replace(/{{next}}/g, 'index.html') 91 | ); 92 | }; 93 | 94 | BookGen.writeFile = function(infile, index, config) { 95 | 96 | console.log(infile) 97 | var tokens = marked.lexer(fs.readFileSync(infile).toString()); 98 | var content = marked 99 | .parser(tokens) 100 | .replace(/<(ul|ol)>/g, '<$1 class="list">') 101 | .replace(/
<pre><code[^>]*>([\s\S]*?)<\/code><\/pre>/mg, '<pre class="prettyprint">$1</pre>')
 102 |         .replace(/<p><img([^>]*)>\s*<\/p>/g, '<p class="img-container"><img$1></p>
') 103 | .replace(/%chapter_number%\.?/g, index+'.'); 104 | 105 | var links = { 106 | 'index': { prev: 'index.html', next: 'intro.html' }, 107 | 'intro': { prev: 'index.html', next: 'abstractions.html' }, 108 | 'abstractions': { prev: 'intro.html', next: 'time.html' }, 109 | 'time': { prev: 'abstractions.html', next: 'replication.html' }, 110 | 'replication': { prev: 'time.html', next: 'eventual.html' }, 111 | 'eventual': { prev: 'replication.html', next: 'appendix.html' }, 112 | 'appendix': { prev: 'eventual.html', next: 'appendix.html' }, 113 | }; 114 | 115 | 116 | // replace until the first alpha character 117 | var outName = path.basename(infile, '.md').replace(/^[^a-z]*/, ''); 118 | 119 | fs.writeFileSync(config.output + outName + '.html', 120 | header 121 | .replace(/{{title}}/g, config.titles[outName +'.md' ] || 'Distributed systems for fun and profit') 122 | .replace(/{{prev}}/g, (links[outName] ? links[outName].prev : '')) 123 | .replace(/{{next}}/g, (links[outName] ? links[outName].next : '')) 124 | // special download header 125 | .replace('', (outName == 'index' ? fs.readFileSync('./layouts/default/index-insert.html') : '')) + 126 | content + 127 | footer 128 | .replace(/{{prev}}/g, (links[outName] ? links[outName].prev : '')) 129 | .replace(/{{next}}/g, (links[outName] ? links[outName].next : '')) 130 | ); 131 | 132 | prev = outName+'.html'; 133 | 134 | return content; 135 | }; 136 | 137 | module.exports = BookGen; 138 | -------------------------------------------------------------------------------- /docs/HACKING.md: -------------------------------------------------------------------------------- 1 | Hacking 2 | ======= 3 | 4 | The build system for the book requires two things: 5 | 6 | * NodeJS (available from http://www.nodejs.org/) 7 | 8 | * The `marked` module for NodeJS: `npm install marked --save` 9 | 10 | To build, simply execute `make` in the project's root directory. 11 | -------------------------------------------------------------------------------- /generate.js: -------------------------------------------------------------------------------- 1 | var BookGen = require('./bookgen/generate.js'); 2 | 3 | var config = { 4 | output: __dirname + '/output/', 5 | 6 | input: { 7 | 8 | dir: __dirname + '/input/', 9 | 10 | files: __dirname + '/input/', 11 | 12 | // specify exact order later on, when single page v is generated 13 | 14 | index: 'index.html' 15 | }, 16 | 17 | titles: { 18 | }, 19 | 20 | layout: __dirname + '/layouts/default/' 21 | 22 | }; 23 | 24 | BookGen.generate(config); 25 | -------------------------------------------------------------------------------- /input/0_index.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | I wanted a text that would bring together the ideas behind many of the more recent distributed systems - systems such as Amazon's Dynamo, Google's BigTable and MapReduce, Apache's Hadoop and so on. 4 | 5 | In this text I've tried to provide a more accessible introduction to distributed systems. To me, that means two things: introducing the key concepts that you will need in order to [have a good time](https://www.google.com/search?q=super+cool+ski+instructor) reading more serious texts, and providing a narrative that covers things in enough detail that you get a gist of what's going on without getting stuck on details. It's 2013, you've got the Internet, and you can selectively read more about the topics you find most interesting. 
 6 | 7 | In my view, much of distributed programming is about dealing with the implications of two consequences of distribution: 8 | 9 | - that information travels at the speed of light 10 | - that independent things fail independently* 11 | 12 | In other words, the core of distributed programming is dealing with distance (duh!) and having more than one thing (duh!). These constraints define a space of possible system designs, and my hope is that after reading this you'll have a better sense of how distance, time and consistency models interact. 13 | 14 | This text is focused on distributed programming and systems concepts you'll need to understand commercial systems in the data center. It would be madness to attempt to cover everything. You'll learn many key protocols and algorithms (covering, for example, many of the most cited papers in the discipline), including some exciting new ways to look at eventual consistency that still haven't made it into college textbooks - such as CRDTs and the CALM theorem. 15 | 16 | I hope you like it! If you want to say thanks, follow me on [Github](https://github.com/mixu/) (or [Twitter](http://twitter.com/mikitotakada)). And if you spot an error, [file an issue on Github](https://github.com/mixu/distsysbook/issues). 17 | 18 | --- 19 | 20 | # 1. Basics 21 | 22 | [The first chapter](intro.html) covers distributed systems at a high level by introducing a number of important terms and concepts. It covers high-level goals, such as scalability, availability, performance, latency and fault tolerance; how those are hard to achieve, and how abstractions and models as well as partitioning and replication come into play. 23 | 24 | # 2. Up and down the level of abstraction 25 | 26 | [The second chapter](abstractions.html) dives deeper into abstractions and impossibility results. It starts with a Nietzsche quote, and then introduces system models and the many assumptions that are made in a typical system model. It then discusses the CAP theorem and summarizes the FLP impossibility result. It then turns to the implications of the CAP theorem, one of which is that one ought to explore other consistency models. A number of consistency models are then discussed. 27 | 28 | # 3. Time and order 29 | 30 | A big part of understanding distributed systems is about understanding time and order. To the extent that we fail to understand and model time, our systems will fail. [The third chapter](time.html) discusses time and order, and clocks as well as the various uses of time, order and clocks (such as vector clocks and failure detectors). 31 | 32 | # 4. Replication: preventing divergence 33 | 34 | The [fourth chapter](replication.html) introduces the replication problem, and the two basic ways in which it can be performed. It turns out that most of the relevant characteristics can be discussed with just this simple characterization. Then, replication methods for maintaining single-copy consistency are discussed, from the least fault tolerant (2PC) to Paxos. 35 | 36 | # 5. Replication: accepting divergence 37 | 38 | The [fifth chapter](eventual.html) discusses replication with weak consistency guarantees. It introduces a basic reconciliation scenario, where partitioned replicas attempt to reach agreement. It then discusses Amazon's Dynamo as an example of a system design with weak consistency guarantees. Finally, two perspectives on disorderly programming are discussed: CRDTs and the CALM theorem. 39 | 40 | # 6. Appendix
 41 | 42 | [The appendix](appendix.html) covers recommendations for further reading. 43 | 44 | --- 45 | 46 | 

*: This is a [lie](http://en.wikipedia.org/wiki/Statistical_independence). [This post by Jay Kreps elaborates](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). 47 |

48 | -------------------------------------------------------------------------------- /input/1_intro.md: -------------------------------------------------------------------------------- 1 | # %chapter_number%. Distributed systems at a high level 2 | 3 | > Distributed programming is the art of solving the same problem that you can solve on a single computer using multiple computers. 4 | 5 | There are two basic tasks that any computer system needs to accomplish: 6 | 7 | - storage and 8 | - computation 9 | 10 | Distributed programming is the art of solving the same problem that you can solve on a single computer using multiple computers - usually, because the problem no longer fits on a single computer. 11 | 12 | Nothing really demands that you use distributed systems. Given infinite money and infinite R&D time, we wouldn't need distributed systems. All computation and storage could be done on a magic box - a single, incredibly fast and incredibly reliable system *that you pay someone else to design for you*. 13 | 14 | However, few people have infinite resources. Hence, they have to find the right place on some real-world cost-benefit curve. At a small scale, upgrading hardware is a viable strategy. However, as problem sizes increase you will reach a point where either the hardware upgrade that allows you to solve the problem on a single node does not exist, or becomes cost-prohibitive. At that point, I welcome you to the world of distributed systems. 15 | 16 | It is a current reality that the best value is in mid-range, commodity hardware - as long as the maintenance costs can be kept down through fault-tolerant software. 17 | 18 | Computations primarily benefit from high-end hardware to the extent to which they can replace slow network accesses with internal memory accesses. The performance advantage of high-end hardware is limited in tasks that require large amounts of communication between nodes. 19 | 20 | ![cost-efficiency](images/barroso_holzle.png) 21 | 22 | As the figure above from [Barroso, Clidaras & Hölzle](http://www.morganclaypool.com/doi/abs/10.2200/S00516ED2V01Y201306CAC024) shows, the performance gap between high-end and commodity hardware decreases with cluster size assuming a uniform memory access pattern across all nodes. 23 | 24 | Ideally, adding a new machine would increase the performance and capacity of the system linearly. But of course this is not possible, because there is some overhead that arises due to having separate computers. Data needs to be copied around, computation tasks have to be coordinated and so on. This is why it's worthwhile to study distributed algorithms - they provide efficient solutions to specific problems, as well as guidance about what is possible, what the minimum cost of a correct implementation is, and what is impossible. 25 | 26 | The focus of this text is on distributed programming and systems in a mundane, but commercially relevant setting: the data center. For example, I will not discuss specialized problems that arise from having an exotic network configuration, or that arise in a shared-memory setting. Additionally, the focus is on exploring the system design space rather than on optimizing any specific design - the latter is a topic for a much more specialized text. 27 | 28 | ## What we want to achieve: Scalability and other good things 29 | 30 | The way I see it, everything starts with the need to deal with size. 
31 | 32 | Most things are trivial at a small scale - and the same problem becomes much harder once you surpass a certain size, volume or other physically constrained thing. It's easy to lift a piece of chocolate, it's hard to lift a mountain. It's easy to count how many people are in a room, and hard to count how many people are in a country. 33 | 34 | So everything starts with size - scalability. Informally speaking, in a scalable system as we move from small to large, things should not get incrementally worse. Here's another definition: 35 | 36 |
<dl>
 37 | <dt>[Scalability](http://en.wikipedia.org/wiki/Scalability)</dt>
 38 | <dd>is the ability of a system, network, or process, to handle a growing amount of work in a capable manner or its ability to be enlarged to accommodate that growth.</dd>
 39 | </dl>
40 | 41 | What is it that is growing? Well, you can measure growth in almost any terms (number of people, electricity usage etc.). But there are three particularly interesting things to look at: 42 | 43 | - Size scalability: adding more nodes should make the system linearly faster; growing the dataset should not increase latency 44 | - Geographic scalability: it should be possible to use multiple data centers to reduce the time it takes to respond to user queries, while dealing with cross-data center latency in some sensible manner. 45 | - Administrative scalability: adding more nodes should not increase the administrative costs of the system (e.g. the administrators-to-machines ratio). 46 | 47 | Of course, in a real system growth occurs on multiple different axes simultaneously; each metric captures just some aspect of growth. 48 | 49 | A scalable system is one that continues to meet the needs of its users as scale increases. There are two particularly relevant aspects - performance and availability - which can be measured in various ways. 50 | 51 | ### Performance (and latency) 52 | 53 |
<dl>
 54 | <dt>[Performance](http://en.wikipedia.org/wiki/Computer_performance)</dt>
 55 | <dd>is characterized by the amount of useful work accomplished by a computer system compared to the time and resources used.</dd>
 56 | </dl>
57 | 58 | Depending on the context, this may involve achieving one or more of the following: 59 | 60 | - Short response time/low latency for a given piece of work 61 | - High throughput (rate of processing work) 62 | - Low utilization of computing resource(s) 63 | 64 | There are tradeoffs involved in optimizing for any of these outcomes. For example, a system may achieve a higher throughput by processing larger batches of work thereby reducing operation overhead. The tradeoff would be longer response times for individual pieces of work due to batching. 65 | 66 | I find that low latency - achieving a short response time - is the most interesting aspect of performance, because it has a strong connection with physical (rather than financial) limitations. It is harder to address latency using financial resources than the other aspects of performance. 67 | 68 | There are a lot of really specific definitions for latency, but I really like the idea that the etymology of the word evokes: 69 | 70 |
<dl>
 71 | <dt>Latency</dt>
 72 | <dd>The state of being latent; delay, a period between the initiation of something and the occurrence.</dd>
 73 | </dl>
74 | 75 | And what does it mean to be "latent"? 76 | 77 |
<dl>
 78 | <dt>Latent</dt>
 79 | <dd>From Latin latens, latentis, present participle of lateo ("lie hidden"). Existing or present but concealed or inactive.</dd>
 80 | </dl>
81 | 82 | This definition is pretty cool, because it highlights how latency is really the time between when something happened and the time it has an impact or becomes visible. 83 | 84 | For example, imagine that you are infected with an airborne virus that turns people into zombies. The latent period is the time between when you became infected, and when you turn into a zombie. That's latency: the time during which something that has already happened is concealed from view. 85 | 86 | Let's assume for a moment that our distributed system does just one high-level task: given a query, it takes all of the data in the system and calculates a single result. In other words, think of a distributed system as a data store with the ability to run a single deterministic computation (function) over its current content: 87 | 88 | `result = query(all data in the system)` 89 | 90 | Then, what matters for latency is not the amount of old data, but rather the speed at which new data "takes effect" in the system. For example, latency could be measured in terms of how long it takes for a write to become visible to readers. 91 | 92 | The other key point based on this definition is that if nothing happens, there is no "latent period". A system in which data doesn't change doesn't (or shouldn't) have a latency problem. 93 | 94 | In a distributed system, there is a minimum latency that cannot be overcome: the speed of light limits how fast information can travel, and hardware components have a minimum latency cost incurred per operation (think RAM and hard drives but also CPUs). 95 | 96 | How much that minimum latency impacts your queries depends on the nature of those queries and the physical distance the information needs to travel. 97 | 98 | ### Availability (and fault tolerance) 99 | 100 | The second aspect of a scalable system is availability. 101 | 102 |
<dl>
 103 | <dt>[Availability](http://en.wikipedia.org/wiki/High_availability)</dt>
 104 | <dd>the proportion of time a system is in a functioning condition. If a user cannot access the system, it is said to be unavailable.</dd>
 105 | </dl>
106 | 107 | Distributed systems allow us to achieve desirable characteristics that would be hard to accomplish on a single system. For example, a single machine cannot tolerate any failures since it either fails or doesn't. 108 | 109 | Distributed systems can take a bunch of unreliable components, and build a reliable system on top of them. 110 | 111 | Systems that have no redundancy can only be as available as their underlying components. Systems built with redundancy can be tolerant of partial failures and thus be more available. It is worth noting that "redundant" can mean different things depending on what you look at - components, servers, datacenters and so on. 112 | 113 | Formulaically, availability is: `Availability = uptime / (uptime + downtime)`. 114 | 115 | Availability from a technical perspective is mostly about being fault tolerant. Because the probability of a failure occurring increases with the number of components, the system should be able to compensate so as to not become less reliable as the number of components increases. 116 | 117 | For example: 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 |
<table>
<tr><td>Availability %</td><td>How much downtime is allowed per year?</td></tr>
<tr><td>90% ("one nine")</td><td>More than a month</td></tr>
<tr><td>99% ("two nines")</td><td>Less than 4 days</td></tr>
<tr><td>99.9% ("three nines")</td><td>Less than 9 hours</td></tr>
<tr><td>99.99% ("four nines")</td><td>Less than an hour</td></tr>
<tr><td>99.999% ("five nines")</td><td>~ 5 minutes</td></tr>
<tr><td>99.9999% ("six nines")</td><td>~ 31 seconds</td></tr>
</table>
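For example, the downtime numbers in the table above fall straight out of the availability formula. A quick back-of-the-envelope sketch (illustrative code, not part of the book's build scripts):

    // availability = uptime / (uptime + downtime), so over a year of operation:
    function downtimeHoursPerYear(availabilityPercent) {
      var hoursPerYear = 365 * 24; // 8760 hours
      return hoursPerYear * (1 - availabilityPercent / 100);
    }

    downtimeHoursPerYear(99.9);  // ~8.76 hours ("three nines")
    downtimeHoursPerYear(99.99); // ~0.88 hours ("four nines")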
 151 | Availability is in some sense a much wider concept than uptime, since the availability of a service can also be affected by, say, a network outage or the company owning the service going out of business (a factor that is not really relevant to fault tolerance, but that would still influence the availability of the system). But without knowing every single specific aspect of the system, the best we can do is design for fault tolerance. 152 | 153 | What does it mean to be fault tolerant? 154 | 155 | 
<dl>
 156 | <dt>Fault tolerance</dt>
 157 | <dd>ability of a system to behave in a well-defined manner once faults occur</dd>
 158 | </dl>
159 | 160 | Fault tolerance boils down to this: define what faults you expect and then design a system or an algorithm that is tolerant of them. You can't tolerate faults you haven't considered. 161 | 162 | ## What prevents us from achieving good things? 163 | 164 | Distributed systems are constrained by two physical factors: 165 | 166 | - the number of nodes (which increases with the required storage and computation capacity) 167 | - the distance between nodes (information travels, at best, at the speed of light) 168 | 169 | Working within those constraints: 170 | 171 | - an increase in the number of independent nodes increases the probability of failure in a system (reducing availability and increasing administrative costs) 172 | - an increase in the number of independent nodes may increase the need for communication between nodes (reducing performance as scale increases) 173 | - an increase in geographic distance increases the minimum latency for communication between distant nodes (reducing performance for certain operations) 174 | 175 | Beyond these tendencies - which are a result of the physical constraints - is the world of system design options. 176 | 177 | Both performance and availability are defined by the external guarantees the system makes. On a high level, you can think of the guarantees as the SLA (service level agreement) for the system: if I write data, how quickly can I access it elsewhere? After the data is written, what guarantees do I have of durability? If I ask the system to run a computation, how quickly will it return results? When components fail, or are taken out of operation, what impact will this have on the system? 178 | 179 | There is another criterion, which is not explicitly mentioned but implied: intelligibility. How understandable are the guarantees that are made? Of course, there are no simple metrics for what is intelligible. 180 | 181 | I was kind of tempted to put "intelligibility" under physical limitations. After all, it is a hardware limitation in people that we have a hard time understanding anything that involves [more moving things than we have fingers](http://en.wikipedia.org/wiki/Working_memory#Capacity). That's the difference between an error and an anomaly - an error is incorrect behavior, while an anomaly is unexpected behavior. If you were smarter, you'd expect the anomalies to occur. 182 | 183 | ## Abstractions and models 184 | 185 | This is where abstractions and models come into play. Abstractions make things more manageable by removing real-world aspects that are not relevant to solving a problem. Models describe the key properties of a distributed system in a precise manner. I'll discuss many kinds of models in the next chapter, such as: 186 | 187 | - System model (asynchronous / synchronous) 188 | - Failure model (crash-fail, partitions, Byzantine) 189 | - Consistency model (strong, eventual) 190 | 191 | A good abstraction makes working with a system easier to understand, while capturing the factors that are relevant for a particular purpose. 192 | 193 | There is a tension between the reality that there are many nodes and with our desire for systems that "work like a single system". Often, the most familiar model (for example, implementing a shared memory abstraction on a distributed system) is too expensive. 194 | 195 | A system that makes weaker guarantees has more freedom of action, and hence potentially greater performance - but it is also potentially hard to reason about. 
People are better at reasoning about systems that work like a single system, rather than a collection of nodes. 196 | 197 | One can often gain performance by exposing more details about the internals of the system. For example, in [columnar storage](http://en.wikipedia.org/wiki/Column-oriented_DBMS), the user can (to some extent) reason about the locality of the key-value pairs within the system and hence make decisions that influence the performance of typical queries. Systems which hide these kinds of details are easier to understand (since they act more like single unit, with fewer details to think about), while systems that expose more real-world details may be more performant (because they correspond more closely to reality). 198 | 199 | Several types of failures make writing distributed systems that act like a single system difficult. Network latency and network partitions (e.g. total network failure between some nodes) mean that a system needs to sometimes make hard choices about whether it is better to stay available but lose some crucial guarantees that cannot be enforced, or to play it safe and refuse clients when these types of failures occur. 200 | 201 | The CAP theorem - which I will discuss in the next chapter - captures some of these tensions. In the end, the ideal system meets both programmer needs (clean semantics) and business needs (availability/consistency/latency). 202 | 203 | ## Design techniques: partition and replicate 204 | 205 | The manner in which a data set is distributed between multiple nodes is very important. In order for any computation to happen, we need to locate the data and then act on it. 206 | 207 | There are two basic techniques that can be applied to a data set. It can be split over multiple nodes (partitioning) to allow for more parallel processing. It can also be copied or cached on different nodes to reduce the distance between the client and the server and for greater fault tolerance (replication). 208 | 209 | > Divide and conquer - I mean, partition and replicate. 210 | 211 | The picture below illustrates the difference between these two: partitioned data (A and B below) is divided into independent sets, while replicated data (C below) is copied to multiple locations. 212 | 213 | ![Partition and replicate](images/part-repl.png) 214 | 215 | This is the one-two punch for solving any problem where distributed computing plays a role. Of course, the trick is in picking the right technique for your concrete implementation; there are many algorithms that implement replication and partitioning, each with different limitations and advantages which need to be assessed against your design objectives. 216 | 217 | ### Partitioning 218 | 219 | Partitioning is dividing the dataset into smaller distinct independent sets; this is used to reduce the impact of dataset growth since each partition is a subset of the data. 220 | 221 | - Partitioning improves performance by limiting the amount of data to be examined and by locating related data in the same partition 222 | - Partitioning improves availability by allowing partitions to fail independently, increasing the number of nodes that need to fail before availability is sacrificed 223 | 224 | Partitioning is also very much application-specific, so it is hard to say much about it without knowing the specifics. That's why the focus is on replication in most texts, including this one. 
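To make the idea concrete, here is a minimal sketch of one common scheme, hash partitioning; the function name and the partition count are illustrative assumptions, not a recommendation:

    // Map each key to one of `partitionCount` partitions using a simple string hash.
    function partitionFor(key, partitionCount) {
      var hash = 0;
      for (var i = 0; i < key.length; i++) {
        hash = (hash * 31 + key.charCodeAt(i)) | 0; // stay within 32-bit range
      }
      return Math.abs(hash) % partitionCount;
    }

    partitionFor('user:1234', 4); // always the same partition index, between 0 and 3

Every node can compute the same mapping without any coordination, but note how the scheme bakes in an access pattern: single-key lookups are cheap, while a query that spans many keys must touch many partitions.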
225 | 226 | Partitioning is mostly about defining your partitions based on what you think the primary access pattern will be, and dealing with the limitations that come from having independent partitions (e.g. inefficient access across partitions, different rate of growth etc.). 227 | 228 | ### Replication 229 | 230 | Replication is making copies of the same data on multiple machines; this allows more servers to take part in the computation. 231 | 232 | Let me inaccurately quote [Homer J. Simpson](http://en.wikipedia.org/wiki/Homer_vs._the_Eighteenth_Amendment): 233 | 234 | > To replication! The cause of, and solution to all of life's problems. 235 | 236 | Replication - copying or reproducing something - is the primary way in which we can fight latency. 237 | 238 | - Replication improves performance by making additional computing power and bandwidth applicable to a new copy of the data 239 | - Replication improves availability by creating additional copies of the data, increasing the number of nodes that need to fail before availability is sacrificed 240 | 241 | Replication is about providing extra bandwidth, and caching where it counts. It is also about maintaining consistency in some way according to some consistency model. 242 | 243 | Replication allows us to achieve scalability, performance and fault tolerance. Afraid of loss of availability or reduced performance? Replicate the data to avoid a bottleneck or single point of failure. Slow computation? Replicate the computation on multiple systems. Slow I/O? Replicate the data to a local cache to reduce latency or onto multiple machines to increase throughput. 244 | 245 | Replication is also the source of many of the problems, since there are now independent copies of the data that has to be kept in sync on multiple machines - this means ensuring that the replication follows a consistency model. 246 | 247 | The choice of a consistency model is crucial: a good consistency model provides clean semantics for programmers (in other words, the properties it guarantees are easy to reason about) and meets business/design goals such as high availability or strong consistency. 248 | 249 | Only one consistency model for replication - strong consistency - allows you to program as-if the underlying data was not replicated. Other consistency models expose some internals of the replication to the programmer. However, weaker consistency models can provide lower latency and higher availability - and are not necessarily harder to understand, just different. 250 | 251 | --- 252 | 253 | ## Further reading 254 | 255 | - [The Datacenter as a Computer - An Introduction to the Design of Warehouse-Scale Machines](http://www.morganclaypool.com/doi/pdf/10.2200/s00193ed1v01y200905cac006) - Barroso & Hölzle, 2008 256 | - [Fallacies of Distributed Computing](http://en.wikipedia.org/wiki/Fallacies_of_Distributed_Computing) 257 | - [Notes on Distributed Systems for Young Bloods](http://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/) - Hodges, 2013 258 | -------------------------------------------------------------------------------- /input/2_abstractions.md: -------------------------------------------------------------------------------- 1 | # %chapter_number%. Up and down the level of abstraction 2 | 3 | In this chapter, we'll travel up and down the level of abstraction, look at some impossibility results (CAP and FLP), and then travel back down for the sake of performance. 
4 | 5 | If you've done any programming, the idea of levels of abstraction is probably familiar to you. You'll always work at some level of abstraction, interface with a lower level layer through some API, and probably provide some higher-level API or user interface to your users. The seven-layer [OSI model of computer networking](http://en.wikipedia.org/wiki/OSI_model) is a good example of this. 6 | 7 | Distributed programming is, I'd assert, in large part dealing with consequences of distribution (duh!). That is, there is a tension between the reality that there are many nodes and with our desire for systems that "work like a single system". That means finding a good abstraction that balances what is possible with what is understandable and performant. 8 | 9 | What do we mean when say X is more abstract than Y? First, that X does not introduce anything new or fundamentally different from Y. In fact, X may remove some aspects of Y or present them in a way that makes them more manageable. 10 | Second, that X is in some sense easier to grasp than Y, assuming that the things that X removed from Y are not important to the matter at hand. 11 | 12 | As [Nietzsche](http://oregonstate.edu/instruct/phl201/modules/Philosophers/Nietzsche/Truth_and_Lie_in_an_Extra-Moral_Sense.htm) wrote: 13 | 14 | > Every concept originates through our equating what is unequal. No leaf ever wholly equals another, and the concept "leaf" is formed through an arbitrary abstraction from these individual differences, through forgetting the distinctions; and now it gives rise to the idea that in nature there might be something besides the leaves which would be "leaf" - some kind of original form after which all leaves have been woven, marked, copied, colored, curled, and painted, but by unskilled hands, so that no copy turned out to be a correct, reliable, and faithful image of the original form. 15 | 16 | Abstractions, fundamentally, are fake. Every situation is unique, as is every node. But abstractions make the world manageable: simpler problem statements - free of reality - are much more analytically tractable and provided that we did not ignore anything essential, the solutions are widely applicable. 17 | 18 | Indeed, if the things that we kept around are essential, then the results we can derive will be widely applicable. This is why impossibility results are so important: they take the simplest possible formulation of a problem, and demonstrate that it is impossible to solve within some set of constraints or assumptions. 19 | 20 | All abstractions ignore something in favor of equating things that are in reality unique. The trick is to get rid of everything that is not essential. How do you know what is essential? Well, you probably won't know a priori. 21 | 22 | Every time we exclude some aspect of a system from our specification of the system, we risk introducing a source of error and/or a performance issue. That's why sometimes we need to go in the other direction, and selectively introduce some aspects of real hardware and the real-world problem back. It may be sufficient to reintroduce some specific hardware characteristics (e.g. physical sequentiality) or other physical characteristics to get a system that performs well enough. 23 | 24 | With this in mind, what is the least amount of reality we can keep around while still working with something that is still recognizable as a distributed system? 
A system model is a specification of the characteristics we consider important; having specified one, we can then take a look at some impossibility results and challenges. 25 | 26 | ## A system model 27 | 28 | A key property of distributed systems is distribution. More specifically, programs in a distributed system: 29 | 30 | - run concurrently on independent nodes ... 31 | - are connected by a network that may introduce nondeterminism and message loss ... 32 | - and have no shared memory or shared clock. 33 | 34 | There are many implications: 35 | 36 | - each node executes a program concurrently 37 | - knowledge is local: nodes have fast access only to their local state, and any information about global state is potentially out of date 38 | - nodes can fail and recover from failure independently 39 | - messages can be delayed or lost (independent of node failure; it is not easy to distinguish network failure and node failure) 40 | - and clocks are not synchronized across nodes (local timestamps do not correspond to the global real time order, which cannot be easily observed) 41 | 42 | A system model enumerates the many assumptions associated with a particular system design. 43 | 44 |
<dl>
 45 | <dt>System model</dt>
 46 | <dd>a set of assumptions about the environment and facilities on which a distributed system is implemented</dd>
 47 | </dl>
48 | 49 | System models vary in their assumptions about the environment and facilities. These assumptions include: 50 | 51 | - what capabilities the nodes have and how they may fail 52 | - how communication links operate and how they may fail and 53 | - properties of the overall system, such as assumptions about time and order 54 | 55 | A robust system model is one that makes the weakest assumptions: any algorithm written for such a system is very tolerant of different environments, since it makes very few and very weak assumptions. 56 | 57 | On the other hand, we can create a system model that is easy to reason about by making strong assumptions. For example, assuming that nodes do not fail means that our algorithm does not need to handle node failures. However, such a system model is unrealistic and hence hard to apply into practice. 58 | 59 | Let's look at the properties of nodes, links and time and order in more detail. 60 | 61 | ### Nodes in our system model 62 | 63 | Nodes serve as hosts for computation and storage. They have: 64 | 65 | - the ability to execute a program 66 | - the ability to store data into volatile memory (which can be lost upon failure) and into stable state (which can be read after a failure) 67 | - a clock (which may or may not be assumed to be accurate) 68 | 69 | Nodes execute deterministic algorithms: the local computation, the local state after the computation, and the messages sent are determined uniquely by the message received and local state when the message was received. 70 | 71 | There are many possible failure models which describe the ways in which nodes can fail. In practice, most systems assume a crash-recovery failure model: that is, nodes can only fail by crashing, and can (possibly) recover after crashing at some later point. 72 | 73 | Another alternative is to assume that nodes can fail by misbehaving in any arbitrary way. This is known as [Byzantine fault tolerance](http://en.wikipedia.org/wiki/Byzantine_fault_tolerance). Byzantine faults are rarely handled in real world commercial systems, because algorithms resilient to arbitrary faults are more expensive to run and more complex to implement. I will not discuss them here. 74 | 75 | ### Communication links in our system model 76 | 77 | Communication links connect individual nodes to each other, and allow messages to be sent in either direction. Many books that discuss distributed algorithms assume that there are individual links between each pair of nodes, that the links provide FIFO (first in, first out) order for messages, that they can only deliver messages that were sent, and that sent messages can be lost. 78 | 79 | Some algorithms assume that the network is reliable: that messages are never lost and never delayed indefinitely. This may be a reasonable assumption for some real-world settings, but in general it is preferable to consider the network to be unreliable and subject to message loss and delays. 80 | 81 | A network partition occurs when the network fails while the nodes themselves remain operational. When this occurs, messages may be lost or delayed until the network partition is repaired. Partitioned nodes may be accessible by some clients, and so must be treated differently from crashed nodes. The diagram below illustrates a node failure vs. a network partition: 82 | 83 | replication 84 | 85 | It is rare to make further assumptions about communication links. We could assume that links only work in one direction, or we could introduce different communication costs (e.g. 
latency due to physical distance) for different links. However, these are rarely concerns in commercial environments except for long-distance links (WAN latency) and so I will not discuss them here; a more detailed model of costs and topology allows for better optimization at the cost of complexity. 86 | 87 | ### Timing / ordering assumptions 88 | 89 | One of the consequences of physical distribution is that each node experiences the world in a unique manner. This is inescapable, because information can only travel at the speed of light. If nodes are at different distances from each other, then any messages sent from one node to the others will arrive at a different time and potentially in a different order at the other nodes. 90 | 91 | Timing assumptions are a convenient shorthand for capturing assumptions about the extent to which we take this reality into account. The two main alternatives are: 92 | 93 |
<dl>
 94 | <dt>Synchronous system model</dt>
 95 | <dd>Processes execute in lock-step; there is a known upper bound on message transmission delay; each process has an accurate clock</dd>
 96 | <dt>Asynchronous system model</dt>
 97 | <dd>No timing assumptions - e.g. processes execute at independent rates; there is no bound on message transmission delay; useful clocks do not exist</dd>
 98 | </dl>
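One practical consequence of the asynchronous model is worth spelling out: without a bound on message delay, a timeout can never prove that a node has failed - it can only raise a suspicion. A minimal sketch (the callback-style API is an illustrative assumption, not from the book's code):

    // Ping a node; if no reply arrives within `timeoutMs`, report a suspicion -
    // not a fact, since the reply may simply still be in flight.
    function pingWithTimeout(sendPing, timeoutMs, onResult) {
      var done = false;
      var timer = setTimeout(function() {
        if (!done) { done = true; onResult('suspected'); }
      }, timeoutMs);
      sendPing(function onReply() {
        if (done) { return; } // the timeout already fired; too late
        done = true;
        clearTimeout(timer);
        onResult('alive');
      });
    }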
99 | 100 | The synchronous system model imposes many constraints on time and order. It essentially assumes that the nodes have the same experience: that messages that are sent are always received within a particular maximum transmission delay, and that processes execute in lock-step. This is convenient, because it allows you as the system designer to make assumptions about time and order, while the asynchronous system model doesn't. 101 | 102 | Asynchronicity is a non-assumption: it just assumes that you can't rely on timing (or a "time sensor"). 103 | 104 | It is easier to solve problems in the synchronous system model, because assumptions about execution speeds, maximum message transmission delays and clock accuracy all help in solving problems since you can make inferences based on those assumptions and rule out inconvenient failure scenarios by assuming they never occur. 105 | 106 | Of course, assuming the synchronous system model is not particularly realistic. Real-world networks are subject to failures and there are no hard bounds on message delay. Real world systems are at best partially synchronous: they may occasionally work correctly and provide some upper bounds, but there will be times where messages are delayed indefinitely and clocks are out of sync. I won't really discuss algorithms for synchronous systems here, but you will probably run into them in many other introductory books because they are analytically easier (but unrealistic). 107 | 108 | 109 | ### The consensus problem 110 | 111 | During the rest of this text, we'll vary the parameters of the system model. Next, we'll look at how varying two system properties: 112 | 113 | - whether or not network partitions are included in the failure model, and 114 | - synchronous vs. asynchronous timing assumptions 115 | 116 | influence the system design choices by discussing two impossibility results (FLP and CAP). 117 | 118 | Of course, in order to have a discussion, we also need to introduce a problem to solve. The problem I'm going to discuss is the [consensus problem](http://en.wikipedia.org/wiki/Consensus_%28computer_science%29). 119 | 120 | Several computers (or nodes) achieve consensus if they all agree on some value. More formally: 121 | 122 | 1. Agreement: Every correct process must agree on the same value. 123 | 2. Integrity: Every correct process decides at most one value, and if it decides some value, then it must have been proposed by some process. 124 | 3. Termination: All processes eventually reach a decision. 125 | 4. Validity: If all correct processes propose the same value V, then all correct processes decide V. 126 | 127 | The consensus problem is at the core of many commercial distributed systems. After all, we want the reliability and performance of a distributed system without having to deal with the consequences of distribution (e.g. disagreements / divergence between nodes), and solving the consensus problem makes it possible to solve several related, more advanced problems such as atomic broadcast and atomic commit. 128 | 129 | ### Two impossibility results 130 | 131 | The first impossibility result, known as the FLP impossibility result, is an impossibility result that is particularly relevant to people who design distributed algorithms. The second - the CAP theorem - is a related result that is more relevant to practitioners; people who need to choose between different system designs but who are not directly concerned with the design of algorithms. 
132 | 133 | ## The FLP impossibility result 134 | 135 | I will only briefly summarize the [FLP impossibility result](http://en.wikipedia.org/wiki/Consensus_%28computer_science%29#Solvability_results_for_some_agreement_problems), though it is considered to be [more important](http://en.wikipedia.org/wiki/Dijkstra_Prize) in academic circles. The FLP impossibility result (named after the authors, Fischer, Lynch and Patterson) examines the consensus problem under the asynchronous system model (technically, the agreement problem, which is a very weak form of the consensus problem). It is assumed that nodes can only fail by crashing; that the network is reliable, and that the typical timing assumptions of the asynchronous system model hold: e.g. there are no bounds on message delay. 136 | 137 | Under these assumptions, the FLP result states that "there does not exist a (deterministic) algorithm for the consensus problem in an asynchronous system subject to failures, even if messages can never be lost, at most one process may fail, and it can only fail by crashing (stopping executing)". 138 | 139 | This result means that there is no way to solve the consensus problem under a very minimal system model in a way that cannot be delayed forever. The argument is that if such an algorithm existed, then one could devise an execution of that algorithm in which it would remain undecided ("bivalent") for an arbitrary amount of time by delaying message delivery - which is allowed in the asynchronous system model. Thus, such an algorithm cannot exist. 140 | 141 | This impossibility result is important because it highlights that assuming the asynchronous system model leads to a tradeoff: algorithms that solve the consensus problem must either give up safety or liveness when the guarantees regarding bounds on message delivery do not hold. 142 | 143 | This insight is particularly relevant to people who design algorithms, because it imposes a hard constraint on the problems that we know are solvable in the asynchronous system model. The CAP theorem is a related theorem that is more relevant to practitioners: it makes slightly different assumptions (network failures rather than node failures), and has more clear implications for practitioners choosing between system designs. 144 | 145 | ## The CAP theorem 146 | 147 | The CAP theorem was initially a conjecture made by computer scientist Eric Brewer. It's a popular and fairly useful way to think about tradeoffs in the guarantees that a system design makes. It even has a [formal proof](http://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf) by [Gilbert](http://www.comp.nus.edu.sg/~gilbert/biblio.html) and [Lynch](http://en.wikipedia.org/wiki/Nancy_Lynch) and no, [Nathan Marz](http://nathanmarz.com/) didn't debunk it, in spite of what [a particular discussion site](http://news.ycombinator.com/) thinks. 148 | 149 | The theorem states that of these three properties: 150 | 151 | - Consistency: all nodes see the same data at the same time. 152 | - Availability: node failures do not prevent survivors from continuing to operate. 153 | - Partition tolerance: the system continues to operate despite message loss due to network and/or node failure 154 | 155 | only two can be satisfied simultaneously. 
We can even draw this as a pretty diagram, picking two properties out of three gives us three types of systems that correspond to different intersections: 156 | 157 | ![CAP theorem](images/CAP.png) 158 | 159 | Note that the theorem states that the middle piece (having all three properties) is not achievable. Then we get three different system types: 160 | 161 | - CA (consistency + availability). Examples include full strict quorum protocols, such as two-phase commit. 162 | - CP (consistency + partition tolerance). Examples include majority quorum protocols in which minority partitions are unavailable such as Paxos. 163 | - AP (availability + partition tolerance). Examples include protocols using conflict resolution, such as Dynamo. 164 | 165 | The CA and CP system designs both offer the same consistency model: strong consistency. The only difference is that a CA system cannot tolerate any node failures; a CP system can tolerate up to `f` faults given `2f+1` nodes in a non-Byzantine failure model (in other words, it can tolerate the failure of a minority `f` of the nodes as long as majority `f+1` stays up). The reason is simple: 166 | 167 | - A CA system does not distinguish between node failures and network failures, and hence must stop accepting writes everywhere to avoid introducing divergence (multiple copies). It cannot tell whether a remote node is down, or whether just the network connection is down: so the only safe thing is to stop accepting writes. 168 | - A CP system prevents divergence (e.g. maintains single-copy consistency) by forcing asymmetric behavior on the two sides of the partition. It only keeps the majority partition around, and requires the minority partition to become unavailable (e.g. stop accepting writes), which retains a degree of availability (the majority partition) and still ensures single-copy consistency. 169 | 170 | I'll discuss this in more detail in the chapter on replication when I discuss Paxos. The important thing is that CP systems incorporate network partitions into their failure model and distinguish between a majority partition and a minority partition using an algorithm like Paxos, Raft or viewstamped replication. CA systems are not partition-aware, and are historically more common: they often use the two-phase commit algorithm and are common in traditional distributed relational databases. 171 | 172 | 173 | 174 | Assuming that a partition occurs, the theorem reduces to a binary choice between availability and consistency. 175 | 176 | ![Based on http://blog.mikiobraun.de/2013/03/misconceptions-about-cap-theorem.html](images/CAP_choice.png) 177 | 178 | 179 | I think there are four conclusions that should be drawn from the CAP theorem: 180 | 181 | First, that *many system designs used in early distributed relational database systems did not take into account partition tolerance* (e.g. they were CA designs). Partition tolerance is an important property for modern systems, since network partitions become much more likely if the system is geographically distributed (as many large systems are). 182 | 183 | Second, that *there is a tension between strong consistency and high availability during network partitions*. The CAP theorem is an illustration of the tradeoffs that occur between strong guarantees and distributed computation. 184 | 185 | In some sense, it is quite crazy to promise that a distributed system consisting of independent nodes connected by an unpredictable network "behaves in a way that is indistinguishable from a non-distributed system". 
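The `2f+1` arithmetic is easy to check - a worked sketch, not anyone's production code:

    // With n = 2f + 1 nodes, a majority quorum has f + 1 members, so it survives
    // any f failures, and any two majorities overlap in at least one node.
    function majoritySize(n) {
      return Math.floor(n / 2) + 1;
    }

    majoritySize(3); // 2 -> tolerates 1 failure
    majoritySize(5); // 3 -> tolerates 2 failures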
186 | 187 | ![From the Simpsons episode Trash of the Titans](images/news_120.jpg) 188 | 189 | Strong consistency guarantees require us to give up availability during a partition. This is because one cannot prevent divergence between two replicas that cannot communicate with each other while continuing to accept writes on both sides of the partition. 190 | 191 | How can we work around this? By strengthening the assumptions (assume no partitions) or by weakening the guarantees. Consistency can be traded off against availability (and the related capabilities of offline accessibility and low latency). If "consistency" is defined as something less than "all nodes see the same data at the same time" then we can have both availability and some (weaker) consistency guarantee. 192 | 193 | Third, that *there is a tension between strong consistency and performance in normal operation*. 194 | 195 | Strong consistency / single-copy consistency requires that nodes communicate and agree on every operation. This results in high latency during normal operation. 196 | 197 | If you can live with a consistency model other than the classic one, a consistency model that allows replicas to lag or to diverge, then you can reduce latency during normal operation and maintain availability in the presence of partitions. 198 | 199 | When fewer messages and fewer nodes are involved, an operation can complete faster. But the only way to accomplish that is to relax the guarantees: let some of the nodes be contacted less frequently, which means that nodes can contain old data. 200 | 201 | This also makes it possible for anomalies to occur. You are no longer guaranteed to get the most recent value. Depending on what kinds of guarantees are made, you might read a value that is older than expected, or even lose some updates. 202 | 203 | 204 | 205 | 206 | 207 | Fourth - and somewhat indirectly - that *if we do not want to give up availability during a network partition, then we need to explore whether consistency models other than strong consistency are workable for our purposes*. 208 | 209 | For example, even if user data is georeplicated to multiple datacenters, and the link between those two datacenters is temporarily out of order, in many cases we'll still want to allow the user to use the website / service. This means reconciling two divergent sets of data later on, which is both a technical challenge and a business risk. But often both the technical challenge and the business risk are manageable, and so it is preferable to provide high availability. 210 | 211 | Consistency and availability are not really binary choices, unless you limit yourself to strong consistency. But strong consistency is just one consistency model: the one where you, by necessity, need to give up availability in order to prevent more than a single copy of the data from being active. As [Brewer himself points out](http://www.infoq.com/articles/cap-twelve-years-later-how-the-rules-have-changed), the "2 out of 3" interpretation is misleading. 212 | 213 | If you take away just one idea from this discussion, let it be this: "consistency" is not a singular, unambiguous property. Remember: 214 | 215 |
216 |

217 | [ACID](http://en.wikipedia.org/wiki/ACID) consistency !=
218 | [CAP](http://en.wikipedia.org/wiki/CAP_theorem) consistency !=
219 | [Oatmeal](http://en.wikipedia.org/wiki/Oatmeal) consistency 220 |

221 |
222 | 223 | Instead, a consistency model is a guarantee - any guarantee - that a data store gives to programs that use it. 224 | 225 |
226 |
Consistency model
227 |
a contract between programmer and system, wherein the system guarantees that if the programmer follows some specific rules, the results of operations on the data store will be predictable
228 |
229 | 230 | The "C" in CAP is "strong consistency", but "consistency" is not a synonym for "strong consistency". 231 | 232 | Let's take a look at some alternative consistency models. 233 | 234 | ## Strong consistency vs. other consistency models 235 | 236 | Consistency models can be categorized into two types: strong and weak consistency models: 237 | 238 | - Strong consistency models (capable of maintaining a single copy) 239 | - Linearizable consistency 240 | - Sequential consistency 241 | - Weak consistency models (not strong) 242 | - Client-centric consistency models 243 | - Causal consistency: strongest model available 244 | - Eventual consistency models 245 | 246 | Strong consistency models guarantee that the apparent order and visibility of updates is equivalent to a non-replicated system. Weak consistency models, on the other hand, do not make such guarantees. 247 | 248 | Note that this is by no means an exhaustive list. Again, consistency models are just arbitrary contracts between the programmer and system, so they can be almost anything. 249 | 250 | ### Strong consistency models 251 | 252 | Strong consistency models can further be divided into two similar, but slightly different consistency models: 253 | 254 | - *Linearizable consistency*: Under linearizable consistency, all operations **appear** to have executed atomically in an order that is consistent with the global real-time ordering of operations. (Herlihy & Wing, 1991) 255 | - *Sequential consistency*: Under sequential consistency, all operations **appear** to have executed atomically in some order that is consistent with the order seen at individual nodes and that is equal at all nodes. (Lamport, 1979) 256 | 257 | The key difference is that linearizable consistency requires that the order in which operations take effect is equal to the actual real-time ordering of operations. Sequential consistency allows for operations to be reordered as long as the order observed on each node remains consistent. The only way someone can distinguish between the two is if they can observe all the inputs and timings going into the system; from the perspective of a client interacting with a node, the two are equivalent. 258 | 259 | The difference seems immaterial, but it is worth noting that sequential consistency does not compose. 260 | 261 | Strong consistency models allow you as a programmer to replace a single server with a cluster of distributed nodes and not run into any problems. 262 | 263 | All the other consistency models have anomalies (compared to a system that guarantees strong consistency), because they behave in a way that is distinguishable from a non-replicated system. But often these anomalies are acceptable, either because we don't care about occasional issues or because we've written code that deals with inconsistencies after they have occurred in some way. 264 | 265 | Note that there really aren't any universal typologies for weak consistency models, because "not a strong consistency model" (e.g. "is distinguishable from a non-replicated system in some way") can be almost anything. 266 | 267 | ### Client-centric consistency models 268 | 269 | *Client-centric consistency models* are consistency models that involve the notion of a client or session in some way. For example, a client-centric consistency model might guarantee that a client will never see older versions of a data item. 
This is often implemented by building additional caching into the client library, so that if a client moves to a replica node that contains old data, then the client library returns its cached value rather than the old value from the replica. 270 | 271 | Clients may still see older versions of the data if the replica node they are on does not contain the latest version, but they will never see anomalies where an older version of a value resurfaces (e.g. because they connected to a different replica). Note that there are many kinds of consistency models that are client-centric. 272 | 273 | ### Eventual consistency 274 | 275 | The *eventual consistency* model says that if you stop changing values, then after some undefined amount of time all replicas will agree on the same value. It is implied that before that time results between replicas are inconsistent in some undefined manner. Since it is [trivially satisfiable](http://www.bailis.org/blog/safety-and-liveness-eventual-consistency-is-not-safe/) (liveness property only), it is useless without supplemental information. 276 | 277 | Saying something is merely eventually consistent is like saying "people are eventually dead". It's a very weak constraint, and we'd probably want to have at least some more specific characterization of two things: 278 | 279 | First, how long is "eventually"? It would be useful to have a strict upper bound, or at least some idea of how long it typically takes for the system to converge to the same value. 280 | 281 | Second, how do the replicas agree on a value? A system that always returns "42" is eventually consistent: all replicas agree on the same value. It just doesn't converge to a useful value, since it keeps returning the same fixed value no matter what is written. Instead, we'd like to have a better idea of the method. For example, one way to decide is to have the value with the largest timestamp always win. 282 | 283 | So when vendors say "eventual consistency", what they mean is some more precise term, such as "eventually last-writer-wins, and read-the-latest-observed-value in the meantime" consistency. The "how?" matters, because a bad method can lead to writes being lost - for example, if the clock on one node is set incorrectly and timestamps are used. 284 | 285 | I will look into these two questions in more detail in the chapter on replication methods for weak consistency models.
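In the meantime, here is a minimal sketch of the "largest timestamp wins" method mentioned above (my own illustration, not any particular store's implementation). The last few lines show how a single skewed clock can silently discard newer writes:

    // A minimal last-writer-wins register (illustrative sketch).
    function LWWRegister(value, timestamp) {
        this.value = value;
        this.timestamp = timestamp;
    }

    // Merging two replicas keeps the write with the larger timestamp,
    // so all replicas converge to the same value.
    LWWRegister.prototype.merge = function(other) {
        if (other.timestamp > this.timestamp) {
            this.value = other.value;
            this.timestamp = other.timestamp;
        }
    };

    var a = new LWWRegister('hello', 1);
    var b = new LWWRegister('world', 2);
    a.merge(b);
    console.log(a.value); // 'world' - replicas agree on the newest write

    // But if one node's clock is far ahead, its write "wins" and any
    // genuinely newer write with a smaller timestamp is silently lost:
    var skewed = new LWWRegister('stale', 9999);
    a.merge(skewed);
    console.log(a.value); // 'stale'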
286 | 287 | --- 288 | 289 | ## Further reading 290 | 291 | - [Brewer's Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](http://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf) - Gilbert & Lynch, 2002 292 | - [Impossibility of distributed consensus with one faulty process](http://scholar.google.com/scholar?q=Impossibility+of+distributed+consensus+with+one+faulty+process) - Fischer, Lynch and Patterson, 1985 293 | - [Perspectives on the CAP Theorem](http://scholar.google.com/scholar?q=Perspectives+on+the+CAP+Theorem) - Gilbert & Lynch, 2012 294 | - [CAP Twelve Years Later: How the "Rules" Have Changed](http://www.infoq.com/articles/cap-twelve-years-later-how-the-rules-have-changed) - Brewer, 2012 295 | - [Uniform consensus is harder than consensus](http://scholar.google.com/scholar?q=Uniform+consensus+is+harder+than+consensus) - Charron-Bost & Schiper, 2000 296 | - [Replicated Data Consistency Explained Through Baseball](http://pages.cs.wisc.edu/~remzi/Classes/739/Papers/Bart/ConsistencyAndBaseballReport.pdf) - Terry, 2011 297 | - [Life Beyond Distributed Transactions: an Apostate's Opinion](http://scholar.google.com/scholar?q=Life+Beyond+Distributed+Transactions%3A+an+Apostate%27s+Opinion) - Helland, 2007 298 | - [If you have too much data, then 'good enough' is good enough](http://dl.acm.org/citation.cfm?id=1953140) - Helland, 2011 299 | - [Building on Quicksand](http://scholar.google.com/scholar?q=Building+on+Quicksand) - Helland & Campbell, 2009 300 | -------------------------------------------------------------------------------- /input/3_time.md: -------------------------------------------------------------------------------- 1 | # %chapter_number%. Time and order 2 | 3 | What is order and why is it important? 4 | 5 | What do you mean "what is order"? 6 | 7 | I mean, why are we so obsessed with order in the first place? Why do we care whether A happened before B? Why don't we care about some other property, like "color"? 8 | 9 | Well, my crazy friend, let's go back to the definition of distributed systems to answer that. 10 | 11 | As you may remember, I described distributed programming as the art of solving the same problem that you can solve on a single computer using multiple computers. 12 | 13 | This is, in fact, at the core of the obsession with order. Any system that can only do one thing at a time will create a total order of operations. Like people passing through a single door, every operation will have a well-defined predecessor and successor. That's basically the programming model that we've worked very hard to preserve. 14 | 15 | The traditional model is: a single program, one process, one memory space running on one CPU. The operating system abstracts away the fact that there might be multiple CPUs and multiple programs, and that the memory on the computer is actually shared among many programs. I'm not saying that threaded programming and event-oriented programming don't exist; it's just that they are special abstractions on top of the "one/one/one" model. Programs are written to be executed in an ordered fashion: you start from the top, and then go down towards the bottom. 16 | 17 | Order as a property has received so much attention because the easiest way to define "correctness" is to say "it works like it would on a single machine". And that usually means that a) we run the same operations and b) that we run them in the same order - even if there are multiple machines. 
18 | 19 | The nice thing about distributed systems that preserve order (as defined for a single system) is that they are generic. You don't need to care about what the operations are, because they will be executed exactly like on a single machine. This is great because you know that you can use the same system no matter what the operations are. 20 | 21 | In reality, a distributed program runs on multiple nodes; with multiple CPUs and multiple streams of operations coming in. You can still assign a total order, but it requires either accurate clocks or some form of communication. You could timestamp each operation using a completely accurate clock then use that to figure out the total order. Or you might have some kind of communication system that makes it possible to assign sequential numbers as in a total order. 22 | 23 | ## Total and partial order 24 | 25 | The natural state in a distributed system is [partial order](http://en.wikipedia.org/wiki/Partially_ordered_set). Neither the network nor independent nodes make any guarantees about relative order; but at each node, you can observe a local order. 26 | 27 | A [total order](http://en.wikipedia.org/wiki/Total_order) is a binary relation that defines an order for every element in some set. 28 | 29 | Two distinct elements are **comparable** when one of them is greater than the other. In a partially ordered set, some pairs of elements are not comparable and hence a partial order doesn't specify the exact order of every item. 30 | 31 | Both total order and partial order are [transitive](http://en.wikipedia.org/wiki/Transitive_relation) and [antisymmetric](http://en.wikipedia.org/wiki/Antisymmetric_relation). The following statements hold in both a total order and a partial order for all a, b and c in X: 32 | 33 | If a ≤ b and b ≤ a then a = b (antisymmetry); 34 | If a ≤ b and b ≤ c then a ≤ c (transitivity); 35 | 36 | However, a total order is [total](http://en.wikipedia.org/wiki/Total_relation): 37 | 38 | a ≤ b or b ≤ a (totality) for all a, b in X 39 | 40 | while a partial order is only [reflexive](http://en.wikipedia.org/wiki/Reflexive_relation): 41 | 42 | a ≤ a (reflexivity) for all a in X 43 | 44 | Note that totality implies reflexivity; so a partial order is a weaker variant of total order. 45 | For some elements in a partial order, the totality property does not hold - in other words, some of the elements are not comparable. 46 | 47 | Git branches are an example of a partial order. As you probably know, the git revision control system allows you to create multiple branches from a single base branch - e.g. from a master branch. Each branch represents a history of source code changes derived based on a common ancestor: 48 | 49 | [ branch A (1,2,0)] [ master (3,0,0) ] [ branch B (1,0,2) ] 50 | [ branch A (1,1,0)] [ master (2,0,0) ] [ branch B (1,0,1) ] 51 | \ [ master (1,0,0) ] / 52 | 53 | The branches A and B were derived from a common ancestor, but there is no definite order between them: they represent different histories and cannot be reduced to a single linear history without additional work (merging). You could, of course, put all the commits in some arbitrary order (say, sorting them first by ancestry and then breaking ties by sorting A before B or B before A) - but that would lose information by forcing a total order where none existed. 54 | 55 | In a system consisting of one node, a total order emerges by necessity: instructions are executed and messages are processed in a specific, observable order in a single program. 
We've come to rely on this total order - it makes executions of programs predictable. This order can be maintained on a distributed system, but at a cost: communication is expensive, and time synchronization is difficult and fragile. 56 | 57 | # What is time? 58 | 59 | Time is a source of order - it allows us to define the order of operations - which coincidentally also has an interpretation that people can understand (a second, a minute, a day and so on). 60 | 61 | In some sense, time is just like any other integer counter. It just happens to be important enough that most computers have a dedicated time sensor, also known as a clock. It's so important that we've figured out how to synthesize an approximation of the same counter using some imperfect physical system (from wax candles to cesium atoms). By "synthesize", I mean that we can approximate the value of the integer counter in physically distant places via some physical property without communicating it directly. 62 | 63 | Timestamps really are a shorthand value for representing the state of the world from the start of the universe to the current moment - if something occurred at a particular timestamp, then it was potentially influenced by everything that happened before it. This idea can be generalized into a causal clock that explicitly tracks causes (dependencies) rather than simply assuming that everything that preceded a timestamp was relevant. Of course, the usual assumption is that we should only worry about the state of the specific system rather than the whole world. 64 | 65 | Assuming that time progresses at the same rate everywhere - and that is a big assumption which I'll return to in a moment - time and timestamps have several useful interpretations when used in a program. The three interpretations are: 66 | 67 | - Order 68 | - Duration 69 | - Interpretation 70 | 71 | *Order*. When I say that time is a source of order, what I mean is that: 72 | 73 | - we can attach timestamps to unordered events to order them 74 | - we can use timestamps to enforce a specific ordering of operations or the delivery of messages (for example, by delaying an operation if it arrives out of order) 75 | - we can use the value of a timestamp to determine whether something happened chronologically before something else 76 | 77 | *Interpretation* - time as a universally comparable value. The absolute value of a timestamp can be interpreted as a date, which is useful for people. Given a timestamp of when a downtime started from a log file, you can tell that it was last Saturday, when there was a [thunderstorm](https://twitter.com/AWSFail/statuses/218915147060752384). 78 | 79 | *Duration* - durations measured in time have some relation to the real world. Algorithms generally don't care about the absolute value of a clock or its interpretation as a date, but they might use durations to make some judgment calls. In particular, the amount of time spent waiting can provide clues about whether a system is partitioned or merely experiencing high latency. 80 | 81 | By their nature, the components of distributed systems do not behave in a predictable manner. They do not guarantee any specific order, rate of advance, or lack of delay. Each node does have some local order - as execution is (roughly) sequential - but these local orders are independent of each other. 82 | 83 | Imposing (or assuming) order is one way to reduce the space of possible executions and possible occurrences. 
Humans have a hard time reasoning about things when things can happen in any order - there just are too many permutations to consider. 84 | 85 | ## Does time progress at the same rate everywhere? 86 | 87 | We all have an intuitive concept of time based on our own experience as individuals. Unfortunately, that intuitive notion of time makes it easier to picture total order rather than partial order. It's easier to picture a sequence in which things happen one after another, rather than concurrently. It is easier to reason about a single order of messages than to reason about messages arriving in different orders and with different delays. 88 | 89 | However, when implementing distributed systems we want to avoid making strong assumptions about time and order, because the stronger the assumptions, the more fragile a system is to issues with the "time sensor" - or the onboard clock. Furthermore, imposing an order carries a cost. The more temporal nondeterminism we can tolerate, the more we can take advantage of distributed computation. 90 | 91 | There are three common answers to the question "does time progress at the same rate everywhere?". These are: 92 | 93 | - "Global clock": yes 94 | - "Local clock": no, but 95 | - "No clock": no! 96 | 97 | These correspond roughly to the three timing assumptions that I mentioned in the second chapter: the synchronous system model has a global clock, the partially synchronous model has a local clock, and in the asynchronous system model one cannot use clocks at all. Let's look at these in more detail. 98 | 99 | ### Time with a "global-clock" assumption 100 | 101 | The global clock assumption is that there is a global clock of perfect accuracy, and that everyone has access to that clock. This is the way we tend to think about time, because in human interactions small differences in time don't really matter. 102 | 103 | ![Global clock](images/global-clock.png) 104 | 105 | The global clock is basically a source of total order (exact order of every operation on all nodes even if those nodes have never communicated). 106 | 107 | However, this is an idealized view of the world: in reality, clock synchronization is only possible to a limited degree of accuracy. This is limited by the lack of accuracy of clocks in commodity computers, by latency if a clock synchronization protocol such as [NTP](http://en.wikipedia.org/wiki/Network_Time_Protocol) is used and fundamentally by [the nature of spacetime](http://en.wikipedia.org/wiki/Time_dilation). 108 | 109 | Assuming that clocks on distributed nodes are perfectly synchronized means assuming that clocks start at the same value and never drift apart. It's a nice assumption because you can use timestamps freely to determine a global total order - bound by clock drift rather than latency - but this is a [nontrivial](http://queue.acm.org/detail.cfm?id=1773943) operational challenge and a potential source of anomalies. There are many scenarios where a simple failure can cause hard-to-trace anomalies: for example, a user accidentally changing the local time on a machine, an out-of-date machine joining a cluster, or synchronized clocks drifting at slightly different rates. 110 | 111 | Nevertheless, there are some real-world systems that make this assumption. Facebook's [Cassandra](http://en.wikipedia.org/wiki/Apache_Cassandra) is an example of a system that assumes clocks are synchronized. It uses timestamps to resolve conflicts between writes - the write with the newer timestamp wins.
This means that if clocks drift, new data may be ignored or overwritten by old data; again, this is an operational challenge (and from what I've heard, one that people are acutely aware of). Another interesting example is Google's [Spanner](http://research.google.com/archive/spanner.html): the paper describes their TrueTime API, which synchronizes time but also estimates worst-case clock drift. 112 | 113 | ### Time with a "Local-clock" assumption 114 | 115 | The second, and perhaps more plausible assumption is that each machine has its own clock, but there is no global clock. It means that you cannot use the local clock in order to determine whether a remote timestamp occurred before or after a local timestamp; in other words, you cannot meaningfully compare timestamps from two different machines. 116 | 117 | ![Local clock](images/local-clock.png) 118 | 119 | The local clock assumption corresponds more closely to the real world. It assigns a partial order: events on each system are ordered but events cannot be ordered across systems by only using a clock. 120 | 121 | However, you can use timestamps to order events on a single machine; and you can use timeouts on a single machine as long as you are careful not to allow the clock to jump around. Of course, on a machine controlled by an end-user this is probably assuming too much: for example, a user might accidentally change their date to a different value while looking up a date using the operating system's date control. 122 | 123 | 124 | ### Time with a "No-clock" assumption 125 | 126 | Finally, there is the notion of logical time. Here, we don't use clocks at all and instead track causality in some other way. Remember, a timestamp is simply a shorthand for the state of the world up to that point - so we can use counters and communication to determine whether something happened before, after or concurrently with something else. 127 | 128 | This way, we can determine the order of events between different machines, but cannot say anything about intervals and cannot use timeouts (since we assume that there is no "time sensor"). This is a partial order: events can be ordered on a single system using a counter and no communication, but ordering events across systems requires a message exchange. 129 | 130 | One of the most cited papers in distributed systems is Lamport's paper on [time, clocks and the ordering of events](http://research.microsoft.com/users/lamport/pubs/time-clocks.pdf). Vector clocks, a generalization of that concept (which I will cover in more detail), are a way to track causality without using clocks. Cassandra's cousins Riak (Basho) and Voldemort (Linkedin) use vector clocks rather than assuming that nodes have access to a global clock of perfect accuracy. This allows those systems to avoid the clock accuracy issues mentioned earlier. 131 | 132 | When clocks are not used, the maximum precision at which events can be ordered across distant machines is bound by communication latency. 133 | 134 | ## How is time used in a distributed system? 135 | 136 | What is the benefit of time? 137 | 138 | 1. Time can define order across a system (without communication) 139 | 2. 
Time can define boundary conditions for algorithms 140 | 141 | The order of events is important in distributed systems, because many properties of distributed systems are defined in terms of the order of operations/events: 142 | 143 | - where correctness depends on (agreement on) correct event ordering, for example serializability in a distributed database 144 | - order can be used as a tie breaker when resource contention occurs, for example if there are two orders for a widget, fulfill the first and cancel the second one 145 | 146 | A global clock would allow operations on two different machines to be ordered without the two machines communicating directly. Without a global clock, we need to communicate in order to determine order. 147 | 148 | Time can also be used to define boundary conditions for algorithms - specifically, to distinguish between "high latency" and "server or network link is down". This is a very important use case; in most real-world systems timeouts are used to determine whether a remote machine has failed, or whether it is simply experiencing high network latency. Algorithms that make this determination are called failure detectors, and I will discuss them fairly soon. 149 | 150 | ## Vector clocks (time for causal order) 151 | 152 | Earlier, we discussed the different assumptions about the rate of progress of time across a distributed system. Assuming that we cannot achieve accurate clock synchronization - or starting with the goal that our system should not be sensitive to issues with time synchronization - how can we order things? 153 | 154 | Lamport clocks and vector clocks are replacements for physical clocks; they rely on counters and communication to determine the order of events across a distributed system. These clocks provide a counter that is comparable across different nodes. 155 | 156 | *A Lamport clock* is simple. Each process maintains a counter using the following rules: 157 | 158 | - Whenever a process does work, increment the counter 159 | - Whenever a process sends a message, include the counter 160 | - When a message is received, set the counter to `max(local_counter, received_counter) + 1` 161 | 162 | Expressed as code: 163 | 164 | function LamportClock() { 165 | this.value = 1; 166 | } 167 | 168 | LamportClock.prototype.get = function() { 169 | return this.value; 170 | } 171 | 172 | LamportClock.prototype.increment = function() { 173 | this.value++; 174 | } 175 | 176 | LamportClock.prototype.merge = function(other) { 177 | this.value = Math.max(this.value, other.value) + 1; 178 | } 179 | 180 | A [Lamport clock](http://en.wikipedia.org/wiki/Lamport_timestamps) allows counters to be compared across systems, with a caveat: Lamport clocks define a partial order. If `timestamp(a) < timestamp(b)`: 181 | 182 | - `a` may have happened before `b` or 183 | - `a` may be incomparable with `b` 184 | 185 | This is known as the clock consistency condition: if one event comes before another, then that event's logical clock comes before the other's. If `a` and `b` are from the same causal history - e.g. both timestamp values were produced on the same process, or `b` is a response to the message sent in `a` - then we know that `a` happened before `b`. 186 | 187 | Intuitively, this is because a Lamport clock can only carry information about one timeline / history; hence, comparing Lamport timestamps from systems that never communicate with each other may cause concurrent events to appear to be ordered when they are not.
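As a quick illustration of these rules, here is how the `LamportClock` above might be used when one process sends a message to another (a hypothetical usage sketch; the message plumbing is assumed):

    // Hypothetical usage of the LamportClock defined above.
    var sender = new LamportClock();
    var receiver = new LamportClock();

    sender.increment(); // the sender does some work...
    sender.increment(); // ...and some more; sender.get() is now 3

    // The sender attaches its counter to an outgoing message,
    var message = { payload: 'hi', ts: sender.get() };

    // and on receipt the receiver merges: max(local, received) + 1.
    receiver.merge({ value: message.ts });

    console.log(receiver.get()); // 4 - greater than every timestamp the
                                 // receiver had seen, so the receive
                                 // event sorts after the send event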
188 | 189 | Imagine a system that after an initial period divides into two independent subsystems which never communicate with each other. 190 | 191 | For all events in each independent system, if `a` happened before `b`, then `ts(a) < ts(b)`; but if you take two events from the different independent systems (e.g. events that are not causally related) then you cannot say anything meaningful about their relative order. While each part of the system has assigned timestamps to events, those timestamps have no relation to each other. Two events may appear to be ordered even though they are unrelated. 192 | 193 | However - and this is still a useful property - from the perspective of a single machine, any message sent with `ts(a)` will receive a response with `ts(b)` which is `> ts(a)`. 194 | 195 | *A vector clock* is an extension of the Lamport clock; it maintains an array `[ t1, t2, ... ]` of N logical clocks - one per node. Rather than incrementing a common counter, each node increments its own logical clock in the vector by one on each internal event. Hence the update rules are: 196 | 197 | - Whenever a process does work, increment the logical clock value of the node in the vector 198 | - Whenever a process sends a message, include the full vector of logical clocks 199 | - When a message is received: 200 | - update each element in the vector to be `max(local, received)` 201 | - increment the logical clock value representing the current node in the vector 202 | 203 | Again, expressed as code: 204 | 205 | function VectorClock(value) { 206 | // expressed as a hash keyed by node id: e.g. { node1: 1, node2: 3 } 207 | this.value = value || {}; 208 | } 209 | 210 | VectorClock.prototype.get = function() { 211 | return this.value; 212 | }; 213 | 214 | VectorClock.prototype.increment = function(nodeId) { 215 | if(typeof this.value[nodeId] == 'undefined') { 216 | this.value[nodeId] = 1; 217 | } else { 218 | this.value[nodeId]++; 219 | } 220 | }; 221 | 222 | VectorClock.prototype.merge = function(other) { 223 | var result = {}, last, 224 | a = this.value, 225 | b = other.value; 226 | // Combine the keys from both hashes, then filter out duplicates 227 | (Object.keys(a) 228 | .concat(Object.keys(b))) 229 | .sort() 230 | .filter(function(key) { 231 | var isDuplicate = (key == last); 232 | last = key; 233 | return !isDuplicate; 234 | }).forEach(function(key) { 235 | result[key] = Math.max(a[key] || 0, b[key] || 0); 236 | }); 237 | this.value = result; 238 | }; 239 | 240 | This illustration ([source](http://en.wikipedia.org/wiki/Vector_clock)) shows a vector clock: 241 | 242 | ![from http://en.wikipedia.org/wiki/Vector_clock](images/vector_clock.svg.png) 243 | 244 | Each of the three nodes (A, B, C) keeps track of the vector clock. As events occur, they are timestamped with the current value of the vector clock. Examining a vector clock such as `{ A: 2, B: 4, C: 1 }` lets us accurately identify the messages that (potentially) influenced that event. 245 | 246 | The issue with vector clocks is mainly that they require one entry per node, which means that they can potentially become very large for large systems. A variety of techniques have been applied to reduce the size of vector clocks (either by performing periodic garbage collection, or by reducing accuracy by limiting the size). 247 | 248 | We've looked at how order and causality can be tracked without physical clocks. Now, let's look at how time durations can be used for cutoff.
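The code above covers updating a vector clock, but not comparing two of them; comparison is what makes vector clocks useful, since one clock happened-before another only if it is less than or equal in every position (and smaller in at least one). Here is a possible comparison helper for the hash representation used above - my sketch, not the book's code:

    // Compare two vector clock hashes (sketch). Returns 'before',
    // 'after', 'equal' or 'concurrent'.
    function compareVectorClocks(a, b) {
        var keys = Object.keys(a).concat(Object.keys(b)),
            aLessOrEqual = true,
            bLessOrEqual = true;
        keys.forEach(function(key) {
            if ((a[key] || 0) > (b[key] || 0)) { aLessOrEqual = false; }
            if ((b[key] || 0) > (a[key] || 0)) { bLessOrEqual = false; }
        });
        if (aLessOrEqual && bLessOrEqual) { return 'equal'; }
        if (aLessOrEqual) { return 'before'; }
        if (bLessOrEqual) { return 'after'; }
        return 'concurrent'; // neither dominates: causally unrelated
    }

    console.log(compareVectorClocks({ A: 1 }, { A: 2, B: 1 })); // 'before'
    console.log(compareVectorClocks({ A: 2 }, { B: 1 }));       // 'concurrent'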
249 | 250 | ## Failure detectors (time for cutoff) 251 | 252 | As I stated earlier, the amount of time spent waiting can provide clues about whether a system is partitioned or merely experiencing high latency. In this case, we don't need to assume a global clock of perfect accuracy - a sufficiently reliable local clock is enough. 253 | 254 | Given a program running on one node, how can it tell that a remote node has failed? In the absence of accurate information, we can infer that an unresponsive remote node has failed after some reasonable amount of time has passed. 255 | 256 | But what is a "reasonable amount"? This depends on the latency between the local and remote nodes. Rather than explicitly specifying algorithms with specific values (which would inevitably be wrong in some cases), it would be nicer to deal with a suitable abstraction. 257 | 258 | A failure detector is a way to abstract away the exact timing assumptions. Failure detectors are implemented using heartbeat messages and timers. Processes exchange heartbeat messages. If a message response is not received before the timeout occurs, then the process suspects the other process. 259 | 260 | A failure detector based on a timeout carries the risk of being either overly aggressive (declaring a node to have failed when it has not) or overly conservative (taking a long time to detect a crash). How accurate do failure detectors need to be for them to be usable? 261 | 262 | [Chandra et al.](http://www.google.com/search?q=Unreliable%20Failure%20Detectors%20for%20Reliable%20Distributed%20Systems) (1996) discuss failure detectors in the context of solving consensus - a problem that is particularly relevant since it underlies most replication problems where the replicas need to agree in environments with latency and network partitions. 263 | 264 | They characterize failure detectors using two properties, completeness and accuracy: 265 | 266 |
267 |
Strong completeness.
268 |
Every crashed process is eventually suspected by every correct process.
269 |
Weak completeness.
270 |
Every crashed process is eventually suspected by some correct process.
271 |
Strong accuracy.
272 |
No correct process is suspected ever.
273 |
Weak accuracy.
274 |
Some correct process is never suspected.
275 |
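To make these definitions concrete, here is a strawman timeout-based detector (a sketch under simplifying assumptions, not a production design). It is complete - a crashed process stops sending heartbeats, so it is eventually suspected - but its accuracy depends entirely on how well the timeout matches actual network delays:

    // A strawman heartbeat failure detector (illustrative sketch).
    function FailureDetector(timeoutMs) {
        this.timeoutMs = timeoutMs;
        this.lastHeartbeat = {}; // process id -> time of last heartbeat
    }

    // Call this whenever a heartbeat arrives from a process.
    FailureDetector.prototype.onHeartbeat = function(processId) {
        this.lastHeartbeat[processId] = Date.now();
    };

    // A process is suspected if its heartbeat is overdue. A slow network
    // can make us wrongly suspect a correct process, so this detector is
    // complete but only eventually accurate.
    FailureDetector.prototype.isSuspected = function(processId) {
        var last = this.lastHeartbeat[processId];
        return (last === undefined) ||
               (Date.now() - last > this.timeoutMs);
    };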
276 | 277 | Completeness is easier to achieve than accuracy; indeed, all failure detectors of importance achieve it - all you need to do is not wait forever before suspecting someone. Chandra et al. note that a failure detector with weak completeness can be transformed to one with strong completeness (by broadcasting information about suspected processes), allowing us to concentrate on the spectrum of accuracy properties. 278 | 279 | Avoiding incorrectly suspecting non-faulty processes is hard unless you are able to assume that there is a hard maximum on the message delay. That assumption can be made in a synchronous system model - and hence failure detectors can be strongly accurate in such a system. Under system models that do not impose hard bounds on message delay, failure detection can at best be eventually accurate. 280 | 281 | Chandra et al. show that even a very weak failure detector - the eventually weak failure detector ⋄W (eventually weak accuracy + weak completeness) - can be used to solve the consensus problem. The diagram below (from the paper) illustrates the relationship between system models and problem solvability: 282 | 283 | ![From Chandra and Toueg. Unreliable failure detectors for reliable distributed systems. JACM 43(2):225–267, 1996.](images/chandra_failure_detectors.png) 284 | 285 | As you can see above, certain problems are not solvable without a failure detector in asynchronous systems. This is because without a failure detector (or strong assumptions about time bounds, e.g. the synchronous system model), it is not possible to tell whether a remote node has crashed or is simply experiencing high latency. That distinction is important for any system that aims for single-copy consistency: failed nodes can be ignored because they cannot cause divergence, but partitioned nodes cannot be safely ignored. 286 | 287 | How can one implement a failure detector? Conceptually, there isn't much to a simple failure detector, which simply detects failure when a timeout expires. The most interesting part relates to how the judgments are made about whether a remote node has failed. 288 | 289 | Ideally, we'd prefer the failure detector to be able to adjust to changing network conditions and to avoid hardcoding timeout values into it. For example, Cassandra uses an [accrual failure detector](https://www.google.com/search?q=The+Phi+accrual+failure+detector), which is a failure detector that outputs a suspicion level on a continuous scale rather than a binary "up" or "down" judgment. This allows the application using the failure detector to make its own decisions about the tradeoff between accurate detection and early detection. 290 | 291 | ## Time, order and performance 292 | 293 | Earlier, I alluded to having to pay the cost for order. What did I mean? 294 | 295 | If you're writing a distributed system, you presumably own more than one computer. The natural (and realistic) view of the world is a partial order, not a total order. You can transform a partial order into a total order, but this requires communication and waiting, and imposes restrictions that limit how many computers can do work at any particular point in time. 296 | 297 | All clocks are mere approximations bound by either network latency (logical time) or by physics. Even keeping a simple integer counter in sync across multiple nodes is a challenge. 298 | 299 | While time and order are often discussed together, time itself is not such a useful property.
Algorithms don't really care about time as much as they care about more abstract properties: 300 | 301 | - the causal ordering of events 302 | - failure detection (e.g. approximations of upper bounds on message delivery) 303 | - consistent snapshots (e.g. the ability to examine the state of a system at some point in time; not discussed here) 304 | 305 | Imposing a total order is possible, but expensive. It requires you to proceed at the common (lowest) speed. Often the easiest way to ensure that events are delivered in some defined order is to nominate a single (bottleneck) node through which all operations are passed. 306 | 307 | Is time / order / synchronicity really necessary? It depends. In some use cases, we want each intermediate operation to move the system from one consistent state to another. For example, in many cases we want the responses from a database to represent all of the available information, and we want to avoid dealing with the issues that might occur if the system could return an inconsistent result. 308 | 309 | But in other cases, we might not need that much time / order / synchronization. For example, if you are running a long running computation, and don't really care about what the system does until the very end - then you don't really need much synchronization as long as you can guarantee that the answer is correct. 310 | 311 | Synchronization is often applied as a blunt tool across all operations, when only a subset of cases actually matter for the final outcome. When is order needed to guarantee correctness? The CALM theorem - which I will discuss in the last chapter - provides one answer. 312 | 313 | In other cases, it is acceptable to give an answer that only represents the best known estimate - that is, is based on only a subset of the total information contained in the system. In particular, during a network partition one may need to answer queries with only a part of the system being accessible. In other use cases, the end user cannot really distinguish between a relatively recent answer that can be obtained cheaply and one that is guaranteed to be correct and is expensive to calculate. For example, is the Twitter follower count for some user X, or X+1? Or are movies A, B and C the absolutely best answers for some query? Doing a cheaper, mostly correct "best effort" can be acceptable. 314 | 315 | In the next two chapters we'll examine replication for fault-tolerant strongly consistent systems - systems which provide strong guarantees while being increasingly resilient to failures. These systems provide solutions for the first case: when you need to guarantee correctness and are willing to pay for it. Then, we'll discuss systems with weak consistency guarantees, which can remain available in the face of partitions, but that can only give you a "best effort" answer. 
316 | 317 | --- 318 | 319 | ## Further reading 320 | 321 | ### Lamport clocks, vector clocks 322 | 323 | - [Time, Clocks and Ordering of Events in a Distributed System](http://research.microsoft.com/users/lamport/pubs/time-clocks.pdf) - Leslie Lamport, 1978 324 | 325 | ### Failure detection 326 | 327 | - [Unreliable failure detectors and reliable distributed systems](http://scholar.google.com/scholar?q=Unreliable+Failure+Detectors+for+Reliable+Distributed+Systems) - Chandra and Toueg 328 | - [Latency- and Bandwidth-Minimizing Optimal Failure Detectors](http://www.cs.cornell.edu/people/egs/sqrt-s/doc/TR2006-2025.pdf) - So & Sirer, 2007 329 | - [The failure detector abstraction](http://scholar.google.com/scholar?q=The+failure+detector+abstraction), Freiling, Guerraoui & Kuznetsov, 2011 330 | 331 | ### Snapshots 332 | 333 | - [Consistent global states of distributed systems: Fundamental concepts and mechanisms](http://scholar.google.com/scholar?q=Consistent+global+states+of+distributed+systems%3A+Fundamental+concepts+and+mechanisms), Ozalp Babaogly and Keith Marzullo, 1993 334 | - [Distributed snapshots: Determining global states of distributed systems](http://scholar.google.com/scholar?q=Distributed+snapshots%3A+Determining+global+states+of+distributed+systems), K. Mani Chandy and Leslie Lamport, 1985 335 | 336 | ### Causality 337 | 338 | - [Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](http://www.vs.inf.ethz.ch/publ/papers/holygrail.pdf) - Schwarz & Mattern, 1994 339 | - [Understanding the Limitations of Causally and Totally Ordered Communication](http://scholar.google.com/scholar?q=Understanding+the+limitations+of+causally+and+totally+ordered+communication) - Cheriton & Skeen, 1993 340 | -------------------------------------------------------------------------------- /input/6_appendix.md: -------------------------------------------------------------------------------- 1 | # %chapter_number%. Further reading and appendix 2 | 3 | If you've made it this far, thank you. 4 | 5 | If you liked the book, follow me on [Github](https://github.com/mixu/) (or [Twitter](http://twitter.com/mikitotakada)). I love seeing that I've had some kind of positive impact. "Create more value than you capture" and all that. 6 | 7 | Many many thanks to: logpath, alexras, globalcitizen, graue, frankshearar, roryokane, jpfuentes2, eeror, cmeiklejohn, stevenproctor eos2102 and steveloughran for their help! Of course, any mistakes and omissions that remain are my fault! 8 | 9 | It's worth noting that my chapter on eventual consistency is fairly Berkeley-centric; I'd like to change that. I've also skipped one prominent use case for time: consistent snapshots. There are also a couple of topics which I should expand on: namely, an explicit discussion of safety and liveness properties and a more detailed discussion of consistent hashing. However, I'm off to [Strange Loop 2013](https://thestrangeloop.com/), so whatever. 10 | 11 | If this book had a chapter 6, it would probably be about the ways in which one can make use of and deal with large amounts of data. It seems that the most common type of "big data" computation is one in which [a large dataset is passed through a single simple program](http://en.wikipedia.org/wiki/SPMD). I'm not sure what the subsequent chapters would be (perhaps high performance computing, given that the current focus has been on feasibility), but I'll probably know in a couple of years. 
12 | 13 | ## Books about distributed systems 14 | 15 | #### Distributed Algorithms (Lynch) 16 | 17 | This is probably the most frequently recommended book on distributed algorithms. I'd also recommend it, but with a caveat. It is very comprehensive, but written for a graduate student audience, so you'll spend a lot of time reading about synchronous systems and shared memory algorithms before getting to things that are most interesting to a practitioner. 18 | 19 | #### Introduction to Reliable and Secure Distributed Programming (Cachin, Guerraoui & Rodrigues) 20 | 21 | For a practitioner, this is a fun one. It's short and full of actual algorithm implementations. 22 | 23 | #### Replication: Theory and Practice 24 | 25 | If you're interested in replication, this book is amazing. The chapter on replication is largely based on a synthesis of the interesting parts of this book plus more recent readings. 26 | 27 | #### Distributed Systems: An Algorithmic Approach (Ghosh) 28 | 29 | #### Introduction to Distributed Algorithms (Tel) 30 | 31 | #### Transactional Information Systems: Theory, Algorithms, and the Practice of Concurrency Control and Recovery (Weikum & Vossen) 32 | 33 | This book is on traditional transactional information systems, e.g. local RDBMS's. There are two chapters on distributed transactions at the end, but the focus of the book is on transaction processing. 34 | 35 | #### Transaction Processing: Concepts and Techniques by Gray and Reuter 36 | 37 | A classic. I find that Weikum & Vossen is more up to date. 38 | 39 | ## Seminal papers 40 | 41 | Each year, the [Edsger W. Dijkstra Prize in Distributed Computing](http://en.wikipedia.org/wiki/Dijkstra_Prize) is given to outstanding papers on the principles of distributed computing. Check out the link for the full list, which includes classics such as: 42 | 43 | - "[Time, Clocks and Ordering of Events in a Distributed System](http://research.microsoft.com/users/lamport/pubs/time-clocks.pdf)" - Leslie Lamport 44 | - "[Impossibility of Distributed Consensus With One Faulty Process](http://theory.lcs.mit.edu/tds/papers/Lynch/jacm85.pdf)" - Fisher, Lynch, Patterson 45 | - "[Unreliable failure detectors and reliable distributed systems](http://scholar.google.com/scholar?q=Unreliable+Failure+Detectors+for+Reliable+Distributed+Systems)" - Chandra and Toueg 46 | 47 | Microsoft Academic Search has a list of [top publications in distributed & parallel computing ordered by number of citations](http://libra.msra.cn/RankList?entitytype=1&topDomainID=2&subDomainID=16&last=0&start=1&end=100) - this may be an interesting list to skim for more classics. 48 | 49 | Here are some additional lists of recommended papers: 50 | 51 | - [Nancy Lynch's recommended reading list](http://courses.csail.mit.edu/6.852/08/handouts/handout3.pdf) from her course on Distributed systems. 52 | - [NoSQL Summer paper list](http://nosqlsummer.org/papers) - a curated list of papers related to this buzzword. 53 | - [A Quora question on seminal papers in distributed systems](http://www.quora.com/What-are-the-seminal-papers-in-distributed-systems-Why). 
54 | 55 | ### Systems 56 | 57 | - [The Google File System](http://research.google.com/archive/gfs.html) - Ghemawat, Gobioff and Leung 58 | - [MapReduce: Simplified Data Processing on Large Clusters](http://research.google.com/archive/mapreduce.html) - Dean and Ghemawat 59 | - [Dynamo: Amazon’s Highly Available Key-value Store](http://scholar.google.com/scholar?q=Dynamo%3A+Amazon's+Highly+Available+Key-value+Store) - DeCandia et al. 60 | - [Bigtable: A Distributed Storage System for Structured Data](http://research.google.com/archive/bigtable.html) - Chang et al. 61 | - [The Chubby Lock Service for Loosely-Coupled Distributed Systems](http://research.google.com/archive/chubby.html) - Burrows 62 | - [ZooKeeper: Wait-free coordination for Internet-scale systems](http://www.usenix.org/event/usenix10/tech/full_papers/Hunt.pdf) - Hunt, Konar, Junqueira, Reed, 2010 63 | -------------------------------------------------------------------------------- /layouts/default/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |
10 |
11 | 12 | 15 | 16 |
17 |
18 |
19 | 20 | 23 | 24 |
25 |
26 |
27 | 28 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /layouts/default/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Distributed systems for fun and profit 5 | 6 | 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 |
30 |
31 |
32 |

Distributed systems

33 |

for fun and profit

34 |
35 | 42 |
43 | 44 | 45 |
46 |
47 |
48 | 49 | 50 | 54 |
55 |
56 |
57 | 58 | 59 | 60 |
61 |
62 | -------------------------------------------------------------------------------- /layouts/default/index-insert.html: -------------------------------------------------------------------------------- 1 | 22 |
23 |
24 |
25 | 26 | 31 | 32 |
33 |
34 |
35 | -------------------------------------------------------------------------------- /layouts/default/single-insert.html: -------------------------------------------------------------------------------- 1 | 13 |
14 |
15 |
16 | 17 | 20 | 21 |
22 |
23 |
24 | -------------------------------------------------------------------------------- /output/appendix.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Distributed systems for fun and profit 5 | 6 | 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 |
30 |
31 |
32 |

Distributed systems

33 |

for fun and profit

34 |
35 | 42 |
43 | 44 | 45 |
46 |
47 |
48 | 49 | 50 | 54 |
55 |
56 |
57 | 58 | 59 | 60 |
61 |
62 |

6. Further reading and appendix

63 |

If you've made it this far, thank you.

64 |

If you liked the book, follow me on Github (or Twitter). I love seeing that I've had some kind of positive impact. "Create more value than you capture" and all that.

65 |

Many many thanks to: logpath, alexras, globalcitizen, graue, frankshearar, roryokane, jpfuentes2, eeror, cmeiklejohn, stevenproctor eos2102 and steveloughran for their help! Of course, any mistakes and omissions that remain are my fault!

66 |

It's worth noting that my chapter on eventual consistency is fairly Berkeley-centric; I'd like to change that. I've also skipped one prominent use case for time: consistent snapshots. There are also a couple of topics which I should expand on: namely, an explicit discussion of safety and liveness properties and a more detailed discussion of consistent hashing. However, I'm off to Strange Loop 2013, so whatever.

67 |

If this book had a chapter 6, it would probably be about the ways in which one can make use of and deal with large amounts of data. It seems that the most common type of "big data" computation is one in which a large dataset is passed through a single simple program. I'm not sure what the subsequent chapters would be (perhaps high performance computing, given that the current focus has been on feasibility), but I'll probably know in a couple of years.

68 |

Books about distributed systems

69 |

Distributed Algorithms (Lynch)

70 |

This is probably the most frequently recommended book on distributed algorithms. I'd also recommend it, but with a caveat. It is very comprehensive, but written for a graduate student audience, so you'll spend a lot of time reading about synchronous systems and shared memory algorithms before getting to things that are most interesting to a practitioner.

71 |

Introduction to Reliable and Secure Distributed Programming (Cachin, Guerraoui & Rodrigues)

72 |

For a practitioner, this is a fun one. It's short and full of actual algorithm implementations.

73 |

Replication: Theory and Practice

74 |

If you're interested in replication, this book is amazing. The chapter on replication is largely based on a synthesis of the interesting parts of this book plus more recent readings.

75 |

Distributed Systems: An Algorithmic Approach (Ghosh)

76 |

Introduction to Distributed Algorithms (Tel)

77 |

Transactional Information Systems: Theory, Algorithms, and the Practice of Concurrency Control and Recovery (Weikum & Vossen)

78 |

This book is on traditional transactional information systems, e.g. local RDBMS's. There are two chapters on distributed transactions at the end, but the focus of the book is on transaction processing.

79 |

Transaction Processing: Concepts and Techniques by Gray and Reuter

80 |

A classic. I find that Weikum & Vossen is more up to date.

81 |

Seminal papers

82 |

Each year, the Edsger W. Dijkstra Prize in Distributed Computing is given to outstanding papers on the principles of distributed computing. Check out the link for the full list, which includes classics such as:

83 | 88 |

Microsoft Academic Search has a list of top publications in distributed & parallel computing ordered by number of citations - this may be an interesting list to skim for more classics.

89 |

Here are some additional lists of recommended papers:

90 | 95 |

Systems

96 | 104 | 105 |
106 | 107 | 108 |
109 | 110 | 111 |
112 |
113 |
114 | 115 | 118 | 119 |
120 |
121 |
122 | 123 | 126 | 127 |
128 |
129 |
130 | 131 | 149 | 150 | 151 | 152 |
153 |
154 | 155 | 156 |
157 | 158 | 159 | -------------------------------------------------------------------------------- /output/assets/assert.css: -------------------------------------------------------------------------------- 1 | .FAIL b, .ERROR b { color: red; /* #990066 */ } 2 | .PASS b { color: #73C836; } 3 | input.runner { 4 | margin-top: 0.1em; 5 | margin-bottom: 1em; 6 | font-size: large; 7 | padding: 0.3em; 8 | } 9 | ol.runner { margin-bottom: 0.2em; } 10 | pre.run { margin-bottom: 0.2em;} 11 | -------------------------------------------------------------------------------- /output/assets/bgnoise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/assets/bgnoise.png -------------------------------------------------------------------------------- /output/assets/ebook.css: -------------------------------------------------------------------------------- 1 | html { font-size: 100%; overflow-y: scroll; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; } 2 | 3 | body{ 4 | font-family:Georgia, Palatino, 'Palatino Linotype', Times, 'Times New Roman', serif; 5 | font-size:12px; 6 | line-height:1.5em; 7 | padding:1em; 8 | margin:auto; 9 | max-width:42em; 10 | background:#fefefe; 11 | } 12 | 13 | a{ color: #0645ad; text-decoration:none;} 14 | a:visited{ color: #0b0080; } 15 | a:hover{ color: #06e; } 16 | a:active{ color:#faa700; } 17 | a:focus{ outline: thin dotted; } 18 | a:hover, a:active{ outline: 0; } 19 | 20 | p{ 21 | margin:1em 0; 22 | } 23 | 24 | img{ 25 | max-width:100%; 26 | } 27 | 28 | h1,h2,h3,h4,h5,h6{ 29 | font-weight:normal; 30 | line-height:1em; 31 | margin-bottom:0em; 32 | } 33 | h4,h5,h6{ font-weight: bold; } 34 | h1{ font-size:2.5em; } 35 | h2{ font-size:2em; } 36 | h3{ font-size:1.5em; } 37 | h4{ font-size:1.2em; } 38 | h5{ font-size:1em; } 39 | h6{ font-size:0.9em; } 40 | 41 | blockquote { 42 | margin:0; 43 | padding: 15px 15px 15px 0px; 44 | font-style: italic; 45 | } 46 | 47 | blockquote p { 48 | margin: 0; 49 | } 50 | 51 | hr { 52 | display: block; 53 | height: 2px; 54 | border: 0; 55 | border-top: 1px solid #000; 56 | border-bottom: 1px solid #000; 57 | margin: 1em 0; 58 | padding: 0; 59 | } 60 | pre, code, kbd, samp { color: #000; font-family: monospace; font-size: 0.98em; } 61 | pre { white-space: pre; white-space: pre-wrap; word-wrap: break-word; } 62 | 63 | b, strong { font-weight: bold; } 64 | 65 | dfn { font-style: italic; } 66 | 67 | ins { background: #ff9; color: #000; text-decoration: none; } 68 | 69 | mark { background: #ff0; color: #000; font-style: italic; font-weight: bold; } 70 | 71 | sub, sup { font-size: 75%; line-height: 0; position: relative; vertical-align: baseline; } 72 | sup { top: -0.5em; } 73 | sub { bottom: -0.25em; } 74 | 75 | ul, ol { margin: 1em 0; padding: 0 0 0 2em; } 76 | li p:last-child { margin:0 } 77 | dd { margin: 0 0 0 2em; } 78 | 79 | img { border: 0; -ms-interpolation-mode: bicubic; vertical-align: middle; } 80 | 81 | table { border-collapse: collapse; border-spacing: 0; } 82 | td { vertical-align: top; } 83 | 84 | @media only screen and (min-width: 480px) { 85 | body{font-size:14px;} 86 | } 87 | 88 | @media only screen and (min-width: 768px) { 89 | body{font-size:16px;} 90 | } 91 | -------------------------------------------------------------------------------- /output/assets/prettify.css: -------------------------------------------------------------------------------- 1 | /* Pretty printing styles. 
Used with prettify.js. */ 2 | 3 | /* SPAN elements with the classes below are added by prettyprint. */ 4 | .pln { color: #000 } /* plain text */ 5 | 6 | @media screen { 7 | .str { color: #080 } /* string content */ 8 | .kwd { color: #008 } /* a keyword */ 9 | .com { color: #800 } /* a comment */ 10 | .typ { color: #606 } /* a type name */ 11 | .lit { color: #066 } /* a literal value */ 12 | /* punctuation, lisp open bracket, lisp close bracket */ 13 | .pun, .opn, .clo { color: #660 } 14 | .tag { color: #008 } /* a markup tag name */ 15 | .atn { color: #606 } /* a markup attribute name */ 16 | .atv { color: #080 } /* a markup attribute value */ 17 | .dec, .var { color: #606 } /* a declaration; a variable name */ 18 | .fun { color: red } /* a function name */ 19 | } 20 | 21 | /* Use higher contrast and text-weight for printable form. */ 22 | @media print, projection { 23 | .str { color: #060 } 24 | .kwd { color: #006; font-weight: bold } 25 | .com { color: #600; font-style: italic } 26 | .typ { color: #404; font-weight: bold } 27 | .lit { color: #044 } 28 | .pun, .opn, .clo { color: #440 } 29 | .tag { color: #006; font-weight: bold } 30 | .atn { color: #404 } 31 | .atv { color: #060 } 32 | } 33 | 34 | /* Put a border around prettyprinted code snippets. */ 35 | pre.prettyprint { background: none; font-size: 14px;} 36 | 37 | /* Specify class=linenums on a pre to get line numbering */ 38 | ol.linenums { margin-top: 0; margin-bottom: 0 } /* IE indents via margin-left */ 39 | li.L0, 40 | li.L1, 41 | li.L2, 42 | li.L3, 43 | li.L5, 44 | li.L6, 45 | li.L7, 46 | li.L8 { list-style-type: none } 47 | /* Alternate shading for lines */ 48 | li.L1, 49 | li.L3, 50 | li.L5, 51 | li.L7, 52 | li.L9 { background: #eee } 53 | -------------------------------------------------------------------------------- /output/assets/prettify.js: -------------------------------------------------------------------------------- 1 | var q=null;window.PR_SHOULD_USE_CONTINUATION=!0; 2 | (function(){function L(a){function m(a){var f=a.charCodeAt(0);if(f!==92)return f;var b=a.charAt(1);return(f=r[b])?f:"0"<=b&&b<="7"?parseInt(a.substring(1),8):b==="u"||b==="x"?parseInt(a.substring(2),16):a.charCodeAt(1)}function e(a){if(a<32)return(a<16?"\\x0":"\\x")+a.toString(16);a=String.fromCharCode(a);if(a==="\\"||a==="-"||a==="["||a==="]")a="\\"+a;return a}function h(a){for(var f=a.substring(1,a.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),a= 3 | [],b=[],o=f[0]==="^",c=o?1:0,i=f.length;c122||(d<65||j>90||b.push([Math.max(65,j)|32,Math.min(d,90)|32]),d<97||j>122||b.push([Math.max(97,j)&-33,Math.min(d,122)&-33]))}}b.sort(function(a,f){return a[0]-f[0]||f[1]-a[1]});f=[];j=[NaN,NaN];for(c=0;ci[0]&&(i[1]+1>i[0]&&b.push("-"),b.push(e(i[1])));b.push("]");return b.join("")}function y(a){for(var f=a.source.match(/\[(?:[^\\\]]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),b=f.length,d=[],c=0,i=0;c=2&&a==="["?f[c]=h(j):a!=="\\"&&(f[c]=j.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return f.join("")}for(var t=0,s=!1,l=!1,p=0,d=a.length;p=5&&"lang-"===b.substring(0,5))&&!(o&&typeof o[1]==="string"))c=!1,b="src";c||(r[f]=b)}i=d;d+=f.length;if(c){c=o[1];var j=f.indexOf(c),k=j+c.length;o[2]&&(k=f.length-o[2].length,j=k-c.length);b=b.substring(5);B(l+i,f.substring(0,j),e,p);B(l+i+j,c,C(b,c),p);B(l+i+k,f.substring(k),e,p)}else p.push(l+i,b)}a.e=p}var h={},y;(function(){for(var e=a.concat(m), 9 | 
l=[],p={},d=0,g=e.length;d=0;)h[n.charAt(k)]=r;r=r[1];n=""+r;p.hasOwnProperty(n)||(l.push(r),p[n]=q)}l.push(/[\S\s]/);y=L(l)})();var t=m.length;return e}function u(a){var m=[],e=[];a.tripleQuotedStrings?m.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?m.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/, 10 | q,"'\"`"]):m.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&e.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var h=a.hashComments;h&&(a.cStyleComments?(h>1?m.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):m.push(["com",/^#(?:(?:define|elif|else|endif|error|ifdef|include|ifndef|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),e.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h|[a-z]\w*)>/,q])):m.push(["com",/^#[^\n\r]*/, 11 | q,"#"]));a.cStyleComments&&(e.push(["com",/^\/\/[^\n\r]*/,q]),e.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));a.regexLiterals&&e.push(["lang-regex",/^(?:^^\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\(|\*|\*=|\+=|,|-=|->|\/|\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\^=|\^\^|\^\^=|{|\||\|=|\|\||\|\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\s*(\/(?=[^*/])(?:[^/[\\]|\\[\S\s]|\[(?:[^\\\]]|\\[\S\s])*(?:]|$))+\/)/]);(h=a.types)&&e.push(["typ",h]);a=(""+a.keywords).replace(/^ | $/g, 12 | "");a.length&&e.push(["kwd",RegExp("^(?:"+a.replace(/[\s,]+/g,"|")+")\\b"),q]);m.push(["pln",/^\s+/,q," \r\n\t\xa0"]);e.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/,q],["pun",/^.[^\s\w"-$'./@\\`]*/,q]);return x(m,e)}function D(a,m){function e(a){switch(a.nodeType){case 1:if(k.test(a.className))break;if("BR"===a.nodeName)h(a), 13 | a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)e(a);break;case 3:case 4:if(p){var b=a.nodeValue,d=b.match(t);if(d){var c=b.substring(0,d.index);a.nodeValue=c;(b=b.substring(d.index+d[0].length))&&a.parentNode.insertBefore(s.createTextNode(b),a.nextSibling);h(a);c||a.parentNode.removeChild(a)}}}}function h(a){function b(a,d){var e=d?a.cloneNode(!1):a,f=a.parentNode;if(f){var f=b(f,1),g=a.nextSibling;f.appendChild(e);for(var h=g;h;h=g)g=h.nextSibling,f.appendChild(h)}return e} 14 | for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),e;(e=a.parentNode)&&e.nodeType===1;)a=e;d.push(a)}var k=/(?:^|\s)nocode(?:\s|$)/,t=/\r\n?|\n/,s=a.ownerDocument,l;a.currentStyle?l=a.currentStyle.whiteSpace:window.getComputedStyle&&(l=s.defaultView.getComputedStyle(a,q).getPropertyValue("white-space"));var p=l&&"pre"===l.substring(0,3);for(l=s.createElement("LI");a.firstChild;)l.appendChild(a.firstChild);for(var d=[l],g=0;g=0;){var h=m[e];A.hasOwnProperty(h)?window.console&&console.warn("cannot override language handler %s",h):A[h]=a}}function C(a,m){if(!a||!A.hasOwnProperty(a))a=/^\s*=o&&(h+=2);e>=c&&(a+=2)}}catch(w){"console"in window&&console.log(w&&w.stack?w.stack:w)}}var v=["break,continue,do,else,for,if,return,while"],w=[[v,"auto,case,char,const,default,double,enum,extern,float,goto,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"], 18 | 
"catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],F=[w,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,dynamic_cast,explicit,export,friend,inline,late_check,mutable,namespace,nullptr,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],G=[w,"abstract,boolean,byte,extends,final,finally,implements,import,instanceof,null,native,package,strictfp,super,synchronized,throws,transient"], 19 | H=[G,"as,base,by,checked,decimal,delegate,descending,dynamic,event,fixed,foreach,from,group,implicit,in,interface,internal,into,is,lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var"],w=[w,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],I=[v,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"], 20 | J=[v,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],v=[v,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],K=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)/,N=/\S/,O=u({keywords:[F,H,w,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END"+ 21 | I,J,v],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),A={};k(O,["default-code"]);k(x([],[["pln",/^[^]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-",/^]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i]]), 22 | ["default-markup","htm","html","mxml","xhtml","xml","xsl"]);k(x([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"]],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/],["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css", 23 | /^style\s*=\s*([^\s"'>]+)/i]]),["in.tag"]);k(x([],[["atv",/^[\S\s]+/]]),["uq.val"]);k(u({keywords:F,hashComments:!0,cStyleComments:!0,types:K}),["c","cc","cpp","cxx","cyc","m"]);k(u({keywords:"null,true,false"}),["json"]);k(u({keywords:H,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:K}),["cs"]);k(u({keywords:G,cStyleComments:!0}),["java"]);k(u({keywords:v,hashComments:!0,multiLineStrings:!0}),["bsh","csh","sh"]);k(u({keywords:I,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}), 24 | 
["cv","py"]);k(u({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["perl","pl","pm"]);k(u({keywords:J,hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb"]);k(u({keywords:w,cStyleComments:!0,regexLiterals:!0}),["js"]);k(u({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,true,try,unless,until,when,while,yes", 25 | hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);k(x([],[["str",/^[\S\s]+/]]),["regex"]);window.prettyPrintOne=function(a,m,e){var h=document.createElement("PRE");h.innerHTML=a;e&&D(h,e);E({g:m,i:e,h:h});return h.innerHTML};window.prettyPrint=function(a){function m(){for(var e=window.PR_SHOULD_USE_CONTINUATION?l.now()+250:Infinity;p=0){var k=k.match(g),f,b;if(b= 26 | !k){b=n;for(var o=void 0,c=b.firstChild;c;c=c.nextSibling)var i=c.nodeType,o=i===1?o?b:c:i===3?N.test(c.nodeValue)?b:o:o;b=(f=o===b?void 0:o)&&"CODE"===f.tagName}b&&(k=f.className.match(g));k&&(k=k[1]);b=!1;for(o=n.parentNode;o;o=o.parentNode)if((o.tagName==="pre"||o.tagName==="code"||o.tagName==="xmp")&&o.className&&o.className.indexOf("prettyprint")>=0){b=!0;break}b||((b=(b=n.className.match(/\blinenums\b(?::(\d+))?/))?b[1]&&b[1].length?+b[1]:!0:!1)&&D(n,b),d={g:k,h:n,i:b},E(d))}}p li:before { 39 | content: counter(customlistcounter) " "; 40 | counter-increment: customlistcounter; 41 | float: left; 42 | width: 2em; 43 | text-align: right; 44 | padding-left: 5px; 45 | padding-right: 5px; 46 | } 47 | 48 | ol.linenums:first-child { 49 | counter-reset: customlistcounter; 50 | } 51 | 52 | ol.linenums { 53 | *list-style-type: decimal; /* targets IE6 and IE7 only */ 54 | } 55 | 56 | li.L0, 57 | li.L1, 58 | li.L2, 59 | li.L3, 60 | li.L4, 61 | li.L5, 62 | li.L6, 63 | li.L7, 64 | li.L8, 65 | li.L9 { color: #8F7E65 } 66 | 67 | pre.prettyprint { 68 | padding: 0.5em; 69 | white-space: pre-wrap; 70 | border: 0px none; 71 | } 72 | -------------------------------------------------------------------------------- /output/assets/printable.css: -------------------------------------------------------------------------------- 1 | /* Minimal font */ 2 | body, div, dl, dt, dd, ul, ol, li, h1, h2, h3, h4, h5, h6, pre, 3 | form, fieldset, input, p, blockquote, table, th, td, embed, object, hr { 4 | padding: 0; 5 | margin: 0; 6 | } 7 | table { 8 | border-collapse: collapse; 9 | border-spacing: 0; 10 | } 11 | fieldset, img, abbr { 12 | border: 0; 13 | } 14 | address, caption, cite, code, dfn, em, 15 | h1, h2, h3, h4, h5, h6, strong, th, var { 16 | font-weight: normal; 17 | font-style: normal; 18 | } 19 | ul { 20 | list-style: none; 21 | } 22 | caption, th { 23 | text-align: left; 24 | } 25 | h1, h2, h3, h4, h5, h6 { 26 | font-size: 1.0em; 27 | } 28 | q:before, q:after { 29 | content: ''; 30 | } 31 | a, ins { 32 | text-decoration: none; 33 | } 34 | 35 | body { 36 | font-family: 'Open Sans', sans-serif; 37 | color: black; 38 | background-color: #FAFAFA; 39 | } 40 | 41 | a { 42 | color: #c00; 43 | } 44 | 45 | hr { 46 | color: #ededed; 47 | background-color: #ededed; 48 | height: 1px; 49 | border: none; 50 | width: 88%; 51 | margin: 18px auto; 52 | } 53 | 54 | /* Header */ 55 | .header { 56 | /* 57 | margin-top: 2.029em; 58 | margin-left: auto; 59 | margin-right: auto; 60 | height: 4em; 61 | */ 62 | /* 63 | 
width: 940px; 64 | position: relative; 65 | */ 66 | width: 700px; 67 | margin: 0 auto; 68 | } 69 | 70 | .header h1 { 71 | font-size: 37px; 72 | font-weight: normal; 73 | line-height: 48px; 74 | } 75 | 76 | .nav, .nav a { 77 | color: #ccc; 78 | } 79 | 80 | #brand { 81 | /* 82 | position: absolute; 83 | bottom: 0; 84 | left: 15px; 85 | */ 86 | } 87 | 88 | #navi { 89 | /* 90 | position: absolute; 91 | right:0; 92 | bottom:0; 93 | padding-right: 38px; 94 | */ 95 | 96 | display: none; 97 | } 98 | 99 | #navi li { 100 | display: inline; 101 | list-style-type: none; 102 | padding-right: 20px; 103 | } 104 | 105 | div.clear { 106 | clear: both; 107 | } 108 | 109 | div.clear hr { 110 | /* 111 | margin: 18px 0px; 112 | */ 113 | } 114 | 115 | #wrapper { 116 | /* width: 940px;*/ 117 | width: 800px; 118 | 119 | padding: 75px 0; 120 | 121 | position: relative; 122 | margin: 0 auto; 123 | 124 | background: #FAFBFC; 125 | box-shadow: 0 1px 6px rgba(0, 0, 0, 0.15); 126 | -moz-box-shadow: 0 0 6px rgba(0, 0, 0, 0.15); 127 | -webkit-box-shadow: 0 1px 6px rgba(0, 0, 0, 0.15); 128 | -o-box-shadow: 0 0 6px rgba(0, 0, 0, 0.15); 129 | 130 | } 131 | 132 | /* Content */ 133 | #content { 134 | font-size: 16px; 135 | line-height: 27px; 136 | 137 | /* 138 | margin-bottom: 0; 139 | padding-bottom: 20px; 140 | float: left; 141 | */ 142 | /* 143 | margin: 0 auto; 144 | width: 600px; 145 | */ 146 | 147 | overflow: visible; 148 | } 149 | 150 | #sidebar { 151 | width: 24%; 152 | float: left; 153 | } 154 | 155 | /* Date widget */ 156 | #content small{ 157 | font-size: 10px; 158 | color: #CCC; 159 | padding: 10px 20px 5px 0px; 160 | float: left; 161 | text-transform: uppercase; 162 | } 163 | #content span.date { 164 | font-size: 32px; 165 | line-height: 0.8; 166 | color: #C00; 167 | } 168 | 169 | /* Post titles */ 170 | 171 | .post h1, .post h2, .post h3, .post h4, .post h5 { 172 | font-family: 'Open Sans', sans-serif; 173 | font-weight: 400; 174 | 175 | margin: 0 auto 0.5em auto; 176 | width: 700px; 177 | 178 | } 179 | 180 | .post h1 { 181 | font-size: 36px; 182 | line-height: 1; 183 | } 184 | 185 | .post h2 { 186 | font-size: 30px; 187 | text-align: left; 188 | line-height: 32px; 189 | margin-bottom: 15px; 190 | } 191 | 192 | .post h2.hl { 193 | background-color: #FFEA00; 194 | padding: 5px 5px 5px 10px; 195 | margin-left: 5px; 196 | 197 | box-shadow: #EAEAEA 3px 3px; 198 | } 199 | 200 | .post p { 201 | margin: 1.5em auto; 202 | width: 700px; 203 | /* max-width: 580px; */ 204 | } 205 | 206 | .post ul { 207 | list-style-type: disc; 208 | } 209 | 210 | .post ul, .post ol { 211 | margin: 0.5em auto; 212 | width: 700px; 213 | 214 | padding-left: 3.333em; 215 | /* max-width: 580px; */ 216 | } 217 | 218 | .post ul ul { 219 | width: 540px; 220 | } 221 | 222 | .post ul ul ul { 223 | width: 480px; 224 | } 225 | 226 | .post pre ul, .post pre ol { 227 | margin: 0; 228 | padding: 0; 229 | } 230 | 231 | .post pre { 232 | padding: 2em 4em; 233 | vertical-align: top; 234 | margin: 0 auto; 235 | 236 | /* 237 | background-color: #002B36; 238 | color: #93A1A1; 239 | */ 240 | background-color: #FDF6E3; 241 | color: #586E75; 242 | } 243 | 244 | .post pre, .post code, .post tt { 245 | font: 0.8em 'lucida console', monospace; 246 | line-height: 1.5; 247 | } 248 | 249 | .post h3 { 250 | font-size: 1.5em; 251 | line-height: 1; 252 | margin-bottom: .5em; 253 | padding-bottom: .5em; 254 | } 255 | 256 | .post h4 { 257 | font-size: 1.2em; 258 | line-height: 1.25; 259 | margin-bottom: 1.25em; 260 | } 261 | 262 | .post p tt, .post p code { 263 | 
background: ghostWhite; 264 | border: 1px solid #DEDEDE; 265 | padding: 0 0.2em; 266 | } 267 | 268 | 269 | 270 | #sidebar h2 { 271 | font-size: 18px; 272 | padding: 5px 0 10px 0; 273 | } 274 | 275 | #sidebar h3 { 276 | font-size: 1.5em; 277 | line-height: 1; 278 | border-bottom: 1px solid #CCC; 279 | margin-bottom: .5em; 280 | padding-bottom: .5em; 281 | } 282 | 283 | 284 | table.table { 285 | max-width: 580px; 286 | border-collapse: collapse; 287 | border-spacing: 0; 288 | margin-bottom: 1em; 289 | margin-left: 15px; 290 | background: #F6F6F6 repeat 0 0; 291 | padding: .5em 1em; 292 | } 293 | table.table caption { text-align: center; } 294 | table.table th { font-weight: bold; } 295 | table.table th, table.table td { padding: 0.3em; vertical-align: top; border: solid 0.1em; } 296 | 297 | div.summary { 298 | border-top: 3px solid #EEE; 299 | border-bottom: 3px solid #EEE; 300 | width: 580px; 301 | color: #666; 302 | margin-top: 2em; 303 | margin-bottom: 1em; 304 | padding-top: 1em; 305 | } 306 | 307 | div.summary ul { 308 | margin: 0 1.5em 1em 0; 309 | } 310 | 311 | span.ref { 312 | font-size: small; 313 | color: #888; 314 | } 315 | 316 | span.ref a { 317 | color: #666; 318 | } 319 | div.notice { 320 | border: 1px solid #E8E8E8; background-color: #FBFBDD; padding: .5em 1em; 321 | } 322 | 323 | blockquote { 324 | /* background: #FF4342; */ 325 | background-color: #D82545; 326 | padding: 30px 30px 1px 60px; 327 | margin: 0 0 30px 0; 328 | font-size: 26px; 329 | line-height: 32px; 330 | text-align: right; 331 | color: #EDF0EA; 332 | -webkit-box-sizing: border-box; 333 | -moz-box-sizing: border-box; 334 | box-sizing: border-box; 335 | /* 336 | display: block; 337 | border-left: 4px solid #4D4D4D; 338 | padding-left: 12px; 339 | line-height: 25px; 340 | margin-top: 0; 341 | margin-bottom: 20px; 342 | margin-right: 6px; 343 | margin-left: 0; 344 | color: #4D4D4D; 345 | */ 346 | /* max-width: 580px; */ 347 | } 348 | 349 | 350 | 351 | .code { 352 | font-family: 'lucida console', monospace; 353 | font-size: 14px; 354 | display: block; 355 | /* max-width: 440px; */ 356 | 357 | 358 | overflow-x: auto; 359 | 360 | background-color: white; 361 | } 362 | .code::-webkit-scrollbar { 363 | width: 6px; 364 | height: 10px; 365 | padding: 18px; 366 | -webkit-border-radius: 1ex 367 | } 368 | 369 | .code::-webkit-scrollbar-thumb { 370 | background-color: rgba(053,057,071,0.3); 371 | padding: 8px; 372 | width: 6px; 373 | height: 6px; 374 | -webkit-border-radius: 1ex; 375 | } 376 | 377 | .code::-webkit-scrollbar-button:start:decrement,pre::-webkit-scrollbar-button:end:increment { 378 | display: block; 379 | height: 10px 380 | } 381 | 382 | .code::-webkit-scrollbar-thumb:vertical, pre::-webkit-scrollbar-thumb:horizontal { 383 | height: 3px; 384 | width: 3px; 385 | margin: 3px; 386 | } 387 | 388 | .code div { 389 | border-left: 6px solid #ebebeb; 390 | white-space: pre; 391 | padding-left: 6px; 392 | } 393 | 394 | .code div:hover { 395 | background-color: lemonChiffon; 396 | } 397 | 398 | .code .hl { 399 | background-color: lemonChiffon; 400 | } 401 | 402 | .code .hl:hover { 403 | background-color: lightGray; 404 | } 405 | 406 | .green { 407 | border-left: 6px solid #93C763 !important; 408 | } 409 | 410 | .blue { 411 | border-left: 6px solid #3074D5 !important; 412 | } 413 | 414 | .red { 415 | border-left: 6px solid #963A46 !important; 416 | } 417 | 418 | .orange { 419 | border-left: 6px solid #EC7600 !important; 420 | } 421 | 422 | .yellow { 423 | border-left: 6px solid #ffea00 !important; 424 | } 425 | 426 | 
.purple { 427 | border-left: 6px solid #A082BD !important; 428 | } 429 | 430 | .right .green, .right .blue, .right .red, .right .orange, 431 | .right .yellow, .right .purple { 432 | margin-left: 18px; 433 | background-color: white; 434 | padding: 1px; 435 | } 436 | 437 | .right p { 438 | margin-bottom: 1em; 439 | } 440 | 441 | .left { 442 | padding-top: 30px; 443 | padding-bottom: 15px; 444 | 445 | background: white; 446 | /* border-right: 1px solid #E5E5EE; */ 447 | vertical-align: top; 448 | padding-left: 15px; 449 | 450 | } 451 | 452 | .right { 453 | padding-top: 30px; 454 | padding-bottom: 15px; 455 | 456 | vertical-align: top; 457 | } 458 | 459 | 460 | tbody { 461 | vertical-align: top; 462 | } 463 | 464 | div .ref { 465 | border: 1px solid #E8E8E8; 466 | background-color: #FBFBDD; 467 | padding: .5em 1em; 468 | /* max-width: 580px; */ 469 | } 470 | 471 | em, b { 472 | background: #FFF198; 473 | color: #222; 474 | } 475 | 476 | *::-moz-selection, *::-webkit-selection, ::selection, .highlight { 477 | background: #FFF198; 478 | color: #222; 479 | } 480 | 481 | 482 | dl { 483 | padding: 3em 2em; 484 | background-color: #68A8C3; 485 | } 486 | dt { 487 | float: left; 488 | clear: left; 489 | width: 140px; 490 | text-align: right; 491 | font-weight: bold; 492 | color: #3A6073; 493 | 494 | } 495 | 496 | dd { 497 | margin: 0 0 0 170px; 498 | padding: 0 0 0.5em 0; 499 | color: white; 500 | } 501 | 502 | .post .img-container { 503 | background-color: white; 504 | width: 100%; 505 | padding: 30px 0px; 506 | border-top: 1px solid #EDEDED; 507 | border-bottom: 1px solid #EDEDED; 508 | } 509 | 510 | img { 511 | margin: auto; 512 | display: block; 513 | } 514 | 515 | dd a { 516 | color: white; 517 | text-decoration: underline; 518 | } 519 | 520 | dt a { 521 | color: #3A6073; 522 | text-decoration: underline; 523 | } 524 | 525 | 526 | .post table { 527 | margin: 1.5em auto; 528 | width: 600px; 529 | } 530 | 531 | .post blockquote p code { 532 | background: transparent; 533 | border: none; 534 | font-style: italic; 535 | padding: 0; 536 | } 537 | 538 | .post blockquote p a { 539 | color: white; 540 | } 541 | 542 | .post blockquote p a:hover { 543 | text-decoration: underline; 544 | } 545 | 546 | #main img { 547 | margin: auto; 548 | display: block; 549 | max-width: 600px; 550 | } 551 | 552 | .fb-like { 553 | float: left; 554 | padding: 0px 8px 8px 0px; 555 | } 556 | 557 | .inline { 558 | margin: 0; 559 | display: inline; 560 | } 561 | -------------------------------------------------------------------------------- /output/assets/quote_colors.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | // fix unescaped chars 3 | $('pre').each(function(i, el) { 4 | $(this).html($(this).html().replace(/</g, '&lt;').replace(/>/g, '&gt;')); 5 | }); 6 | // resize text 7 | $('blockquote p:not(".special")').each(function(i, el) { 8 | var length = el.innerText.length; 9 | if(length > 45) { 10 | $(el).css({ fontSize: '24px', lineHeight: '140%' }); 11 | } 12 | if(length > 100) { 13 | $(el).css({ fontSize: '22px', lineHeight: '140%'}); 14 | // light-beige 15 | $(el).parent().css({ backgroundColor: '#f6f0d8', color: '#424242' }); 16 | } 17 | if(length > 130) { 18 | $(el).css({ fontSize: '20px', lineHeight: '140%' }); 19 | // light-green 20 | $(el).parent().css({ backgroundColor: '#9ec9ac', color: '#424242' }); 21 | } 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /output/assets/runner.js:
-------------------------------------------------------------------------------- 1 | var dummy = { 2 | console: function(elementId) { 3 | return { 4 | log: function() { 5 | var msg = ""; 6 | for ( var i = 0; i < arguments.length; i++ ) { 7 | msg += " " + JSON.stringify(arguments[i]); 8 | } 9 | jQuery("#results_"+elementId).append("<li>LOG " + msg + "</li>"); 10 | }, 11 | error: function(msg){ 12 | jQuery("#results_"+elementId).append("<li>ERROR " + msg + "</li>"); 13 | } 14 | } 15 | } 16 | }; 17 | 18 | function run(id) { 19 | try { 20 | var str = 'var console = dummy.console("'+id+'"); ' + jQuery('#block_'+id).text(); 21 | (new Function('dummy', str ))(dummy); 22 | } catch(e){ 23 | var d = dummy.console(id); 24 | d.error(e.message); 25 | } 26 | } 27 | 28 | $(document).ready(function() { 29 | var i = 1; 30 | jQuery(".run").each(function(idx, elem) { /* markup below reconstructed from the surrounding code and assert.css; exact attributes assumed */ 31 | jQuery(elem).attr('id','block_'+i).after('<input type="button" class="runner" value="run" onclick="run('+i+');" /><ol class="runner" id="results_'+i+'"></ol>
      '); 32 | i++; 33 | }); 34 | prettyPrint(); 35 | }); 36 | -------------------------------------------------------------------------------- /output/assets/style.css: -------------------------------------------------------------------------------- 1 | /* Minimal font */ 2 | body, div, dl, dt, dd, ul, ol, li, h1, h2, h3, h4, h5, h6, pre, 3 | form, fieldset, input, p, blockquote, table, th, td, embed, object, hr { 4 | padding: 0; 5 | margin: 0; 6 | } 7 | table { 8 | border-collapse: collapse; 9 | border-spacing: 0; 10 | } 11 | fieldset, img, abbr { 12 | border: 0; 13 | } 14 | address, caption, cite, code, dfn, em, 15 | h1, h2, h3, h4, h5, h6, strong, th, var { 16 | font-weight: normal; 17 | font-style: normal; 18 | } 19 | ul { 20 | list-style: none; 21 | } 22 | caption, th { 23 | text-align: left; 24 | } 25 | h1, h2, h3, h4, h5, h6 { 26 | font-size: 1.0em; 27 | } 28 | q:before, q:after { 29 | content: ''; 30 | } 31 | a, ins { 32 | text-decoration: none; 33 | } 34 | 35 | body { 36 | font-family: 'Open Sans', sans-serif; 37 | color: #444; 38 | background-color: #F4F4F4; 39 | background-image: url(bgnoise.png); 40 | } 41 | 42 | a { 43 | color: #c00; 44 | } 45 | 46 | hr { 47 | color: #ededed; 48 | background-color: #ededed; 49 | height: 1px; 50 | border: none; 51 | width: 88%; 52 | margin: 18px auto; 53 | } 54 | 55 | /* Header */ 56 | .header { 57 | /* 58 | margin-top: 2.029em; 59 | margin-left: auto; 60 | margin-right: auto; 61 | height: 4em; 62 | */ 63 | /* 64 | width: 940px; 65 | position: relative; 66 | */ 67 | width: 600px; 68 | margin: 0 auto; 69 | } 70 | 71 | .header h1 { 72 | font-size: 37px; 73 | font-weight: normal; 74 | line-height: 48px; 75 | } 76 | 77 | .nav, .nav a { 78 | color: #999; 79 | } 80 | 81 | .nav a:hover { 82 | color: #c00; 83 | } 84 | 85 | #brand { 86 | /* 87 | position: absolute; 88 | bottom: 0; 89 | left: 15px; 90 | */ 91 | } 92 | 93 | #navi { 94 | /* 95 | position: absolute; 96 | right:0; 97 | bottom:0; 98 | padding-right: 38px; 99 | */ 100 | 101 | display: none; 102 | } 103 | 104 | #navi li { 105 | display: inline; 106 | list-style-type: none; 107 | padding-right: 20px; 108 | } 109 | 110 | div.clear { 111 | clear: both; 112 | } 113 | 114 | div.clear hr { 115 | /* 116 | margin: 18px 0px; 117 | */ 118 | } 119 | 120 | #wrapper { 121 | /* width: 940px;*/ 122 | width: 700px; 123 | 124 | padding: 75px 0; 125 | 126 | position: relative; 127 | margin: 0 auto; 128 | 129 | background: #FAFBFC; 130 | box-shadow: 0 1px 6px rgba(0, 0, 0, 0.15); 131 | -moz-box-shadow: 0 0 6px rgba(0, 0, 0, 0.15); 132 | -webkit-box-shadow: 0 1px 6px rgba(0, 0, 0, 0.15); 133 | -o-box-shadow: 0 0 6px rgba(0, 0, 0, 0.15); 134 | 135 | } 136 | 137 | /* Content */ 138 | #content { 139 | font-size: 16px; 140 | line-height: 27px; 141 | 142 | /* 143 | margin-bottom: 0; 144 | padding-bottom: 20px; 145 | float: left; 146 | */ 147 | /* 148 | margin: 0 auto; 149 | width: 600px; 150 | */ 151 | 152 | overflow: visible; 153 | } 154 | 155 | #sidebar { 156 | width: 24%; 157 | float: left; 158 | } 159 | 160 | /* Date widget */ 161 | #content small{ 162 | font-size: 10px; 163 | color: #CCC; 164 | padding: 10px 20px 5px 0px; 165 | float: left; 166 | text-transform: uppercase; 167 | } 168 | #content span.date { 169 | font-size: 32px; 170 | line-height: 0.8; 171 | color: #C00; 172 | } 173 | 174 | /* Post titles */ 175 | 176 | .post h1, .post h2, .post h3, .post h4, .post h5 { 177 | font-family: 'Open Sans', sans-serif; 178 | font-weight: 400; 179 | 180 | margin: 0.4em auto 0.5em auto; 181 | width: 600px; 182 | 183 | } 
184 | 185 | .post h1 { 186 | font-size: 36px; 187 | line-height: 1; 188 | } 189 | 190 | .post h2 { 191 | font-size: 30px; 192 | text-align: left; 193 | line-height: 32px; 194 | margin-bottom: 15px; 195 | } 196 | 197 | .post h2.hl { 198 | background-color: #FFEA00; 199 | padding: 5px 5px 5px 10px; 200 | margin-left: 5px; 201 | 202 | box-shadow: #EAEAEA 3px 3px; 203 | } 204 | 205 | .post p { 206 | margin: 1.5em auto; 207 | width: 600px; 208 | /* max-width: 580px; */ 209 | } 210 | 211 | .post ul { 212 | list-style-type: disc; 213 | } 214 | 215 | .post ul, .post ol { 216 | margin: 0.5em auto; 217 | width: 600px; 218 | 219 | padding-left: 3.333em; 220 | /* max-width: 580px; */ 221 | } 222 | 223 | .post ul ul { 224 | width: 540px; 225 | } 226 | 227 | .post ul ul ul { 228 | width: 480px; 229 | } 230 | 231 | .post pre ul, .post pre ol { 232 | margin: 0; 233 | padding: 0; 234 | } 235 | 236 | .post pre { 237 | padding: 2em 4em; 238 | vertical-align: top; 239 | margin: 0 auto; 240 | 241 | /* 242 | background-color: #002B36; 243 | color: #93A1A1; 244 | */ 245 | background-color: #FDF6E3; 246 | color: #586E75; 247 | } 248 | 249 | .post pre, .post code, .post tt { 250 | font: 0.8em 'lucida console', monospace; 251 | line-height: 1.5; 252 | } 253 | 254 | .post h3 { 255 | font-size: 1.5em; 256 | line-height: 1; 257 | margin-bottom: .5em; 258 | padding-bottom: .5em; 259 | } 260 | 261 | .post h4 { 262 | font-size: 1.2em; 263 | line-height: 1.25; 264 | margin-bottom: 1.25em; 265 | } 266 | 267 | .post p tt, .post p code, .post li code { 268 | background: ghostWhite; 269 | border: 1px solid #DEDEDE; 270 | padding: 0 0.2em; 271 | } 272 | 273 | .post blockquote p code { 274 | background: transparent; 275 | border: none; 276 | font-style: italic; 277 | padding: 0; 278 | } 279 | 280 | .post blockquote p a { 281 | color: white; 282 | } 283 | 284 | .post blockquote p a:hover { 285 | text-decoration: underline; 286 | } 287 | 288 | 289 | #sidebar h2 { 290 | font-size: 18px; 291 | padding: 5px 0 10px 0; 292 | } 293 | 294 | #sidebar h3 { 295 | font-size: 1.5em; 296 | line-height: 1; 297 | border-bottom: 1px solid #CCC; 298 | margin-bottom: .5em; 299 | padding-bottom: .5em; 300 | } 301 | 302 | 303 | table.table { 304 | max-width: 580px; 305 | border-collapse: collapse; 306 | border-spacing: 0; 307 | margin-bottom: 1em; 308 | margin-left: 15px; 309 | background: #F6F6F6 repeat 0 0; 310 | padding: .5em 1em; 311 | } 312 | table.table caption { text-align: center; } 313 | table.table th { font-weight: bold; } 314 | table.table th, table.table td { padding: 0.3em; vertical-align: top; border: solid 0.1em; } 315 | 316 | div.summary { 317 | border-top: 3px solid #EEE; 318 | border-bottom: 3px solid #EEE; 319 | width: 580px; 320 | color: #666; 321 | margin-top: 2em; 322 | margin-bottom: 1em; 323 | padding-top: 1em; 324 | } 325 | 326 | div.summary ul { 327 | margin: 0 1.5em 1em 0; 328 | } 329 | 330 | span.ref { 331 | font-size: small; 332 | color: #888; 333 | } 334 | 335 | span.ref a { 336 | color: #666; 337 | } 338 | div.notice { 339 | border: 1px solid #E8E8E8; background-color: #FBFBDD; padding: .5em 1em; 340 | } 341 | 342 | blockquote { 343 | /* background: #FF4342; */ 344 | background-color: #D82545; 345 | padding: 30px 30px 1px 60px; 346 | margin: 0 0 30px 0; 347 | /* 348 | font-size: 26px; 349 | line-height: 32px; 350 | */ 351 | font-size: 20px; 352 | line-height: 140%; 353 | 354 | text-align: right; 355 | color: #EDF0EA; 356 | -webkit-box-sizing: border-box; 357 | -moz-box-sizing: border-box; 358 | box-sizing: border-box; 
359 | /* 360 | display: block; 361 | border-left: 4px solid #4D4D4D; 362 | padding-left: 12px; 363 | line-height: 25px; 364 | margin-top: 0; 365 | margin-bottom: 20px; 366 | margin-right: 6px; 367 | margin-left: 0; 368 | color: #4D4D4D; 369 | */ 370 | /* max-width: 580px; */ 371 | } 372 | 373 | 374 | 375 | .code { 376 | font-family: 'lucida console', monospace; 377 | font-size: 14px; 378 | display: block; 379 | /* max-width: 440px; */ 380 | 381 | 382 | overflow-x: auto; 383 | 384 | background-color: white; 385 | } 386 | .code::-webkit-scrollbar { 387 | width: 6px; 388 | height: 10px; 389 | padding: 18px; 390 | -webkit-border-radius: 1ex 391 | } 392 | 393 | .code::-webkit-scrollbar-thumb { 394 | background-color: rgba(053,057,071,0.3); 395 | padding: 8px; 396 | width: 6px; 397 | height: 6px; 398 | -webkit-border-radius: 1ex; 399 | } 400 | 401 | .code::-webkit-scrollbar-button:start:decrement,pre::-webkit-scrollbar-button:end:increment { 402 | display: block; 403 | height: 10px 404 | } 405 | 406 | .code::-webkit-scrollbar-thumb:vertical, pre::-webkit-scrollbar-thumb:horizontal { 407 | height: 3px; 408 | width: 3px; 409 | margin: 3px; 410 | } 411 | 412 | .code div { 413 | border-left: 6px solid #ebebeb; 414 | white-space: pre; 415 | padding-left: 6px; 416 | } 417 | 418 | .code div:hover { 419 | background-color: lemonChiffon; 420 | } 421 | 422 | .code .hl { 423 | background-color: lemonChiffon; 424 | } 425 | 426 | .code .hl:hover { 427 | background-color: lightGray; 428 | } 429 | 430 | .green { 431 | border-left: 6px solid #93C763 !important; 432 | } 433 | 434 | .blue { 435 | border-left: 6px solid #3074D5 !important; 436 | } 437 | 438 | .red { 439 | border-left: 6px solid #963A46 !important; 440 | } 441 | 442 | .orange { 443 | border-left: 6px solid #EC7600 !important; 444 | } 445 | 446 | .yellow { 447 | border-left: 6px solid #ffea00 !important; 448 | } 449 | 450 | .purple { 451 | border-left: 6px solid #A082BD !important; 452 | } 453 | 454 | .right .green, .right .blue, .right .red, .right .orange, 455 | .right .yellow, .right .purple { 456 | margin-left: 18px; 457 | background-color: white; 458 | padding: 1px; 459 | } 460 | 461 | .right p { 462 | margin-bottom: 1em; 463 | } 464 | 465 | .left { 466 | padding-top: 30px; 467 | padding-bottom: 15px; 468 | 469 | background: white; 470 | /* border-right: 1px solid #E5E5EE; */ 471 | vertical-align: top; 472 | padding-left: 15px; 473 | 474 | } 475 | 476 | .right { 477 | padding-top: 30px; 478 | padding-bottom: 15px; 479 | 480 | vertical-align: top; 481 | } 482 | 483 | 484 | tbody { 485 | vertical-align: top; 486 | } 487 | 488 | div .ref { 489 | border: 1px solid #E8E8E8; 490 | background-color: #FBFBDD; 491 | padding: .5em 1em; 492 | /* max-width: 580px; */ 493 | } 494 | 495 | em, b { 496 | background: #FFF198; 497 | color: #222; 498 | } 499 | 500 | *::-moz-selection, *::-webkit-selection, ::selection, .highlight { 501 | background: #FFF198; 502 | color: #222; 503 | } 504 | 505 | /* 506 | dl { 507 | padding: 3em 2em; 508 | background-color: #68A8C3; 509 | } 510 | dt { 511 | float: left; 512 | clear: left; 513 | width: 140px; 514 | text-align: right; 515 | font-weight: bold; 516 | color: #3A6073; 517 | } 518 | 519 | dd { 520 | margin: 0 0 0 170px; 521 | padding: 0 0 0.5em 0; 522 | color: white; 523 | } 524 | */ 525 | 526 | dl { 527 | background-color: #68A8C3; 528 | } 529 | dt { 530 | text-align: right; 531 | font-weight: bold; 532 | color: #3A6073; 533 | } 534 | 535 | dd { 536 | color: white; 537 | } 538 | 539 | 540 | dl { 541 | padding: 30px 
0px; 542 | } 543 | 544 | dl dt { 545 | clear: left; 546 | float: left; 547 | width: 200px; 548 | margin: 0; 549 | padding: 8px 5px; 550 | /* border-top: 1px solid #999; */ 551 | font-weight: bold; 552 | } 553 | 554 | dl dd { 555 | margin-left: 210px; 556 | padding: 8px 45px 8px 25px; 557 | 558 | /* border-top: 1px solid #999; */ 559 | } 560 | 561 | 562 | dd a { 563 | color: white; 564 | text-decoration: underline; 565 | } 566 | 567 | dt a { 568 | color: #3A6073; 569 | text-decoration: underline; 570 | } 571 | 572 | .post .img-container { 573 | background-color: white; 574 | width: 100%; 575 | padding: 30px 0px; 576 | border-top: 1px solid #EDEDED; 577 | border-bottom: 1px solid #EDEDED; 578 | } 579 | 580 | #main img { 581 | margin: auto; 582 | display: block; 583 | max-width: 600px; 584 | } 585 | 586 | .inline { 587 | margin: 0; 588 | display: inline; 589 | } 590 | 591 | .post table { 592 | margin: 1.5em auto; 593 | width: 600px; 594 | } 595 | 596 | .footnote { 597 | color: #bbb; 598 | font-size: smaller; 599 | } 600 | 601 | .footnote a { 602 | color: #888; 603 | } 604 | 605 | .fb-like { 606 | float: left; 607 | padding: 0px 8px 8px 0px; 608 | } 609 | -------------------------------------------------------------------------------- /output/assets/sunburst.css: -------------------------------------------------------------------------------- 1 | /* Pretty printing styles. Used with prettify.js. */ 2 | /* Vim sunburst theme by David Leibovic */ 3 | 4 | pre .str, code .str { color: #65B042; } /* string - green */ 5 | pre .kwd, code .kwd { color: #E28964; } /* keyword - dark pink */ 6 | pre .com, code .com { color: #AEAEAE; font-style: italic; } /* comment - gray */ 7 | pre .typ, code .typ { color: #89bdff; } /* type - light blue */ 8 | pre .lit, code .lit { color: #3387CC; } /* literal - blue */ 9 | pre .pun, code .pun { color: #fff; } /* punctuation - white */ 10 | pre .pln, code .pln { color: #fff; } /* plaintext - white */ 11 | pre .tag, code .tag { color: #89bdff; } /* html/xml tag - light blue */ 12 | pre .atn, code .atn { color: #bdb76b; } /* html/xml attribute name - khaki */ 13 | pre .atv, code .atv { color: #65B042; } /* html/xml attribute value - green */ 14 | pre .dec, code .dec { color: #3387CC; } /* decimal - blue */ 15 | 16 | pre.prettyprint, code.prettyprint { 17 | background-color: #000; 18 | -moz-border-radius: 8px; 19 | -webkit-border-radius: 8px; 20 | -o-border-radius: 8px; 21 | -ms-border-radius: 8px; 22 | -khtml-border-radius: 8px; 23 | border-radius: 8px; 24 | } 25 | 26 | pre.prettyprint { 27 | width: 95%; 28 | margin: 1em auto; 29 | padding: 1em; 30 | white-space: pre-wrap; 31 | 32 | font-size: 14px; 33 | } 34 | 35 | 36 | /* Specify class=linenums on a pre to get line numbering */ 37 | ol.linenums { margin-top: 0; margin-bottom: 0; color: #AEAEAE; } /* IE indents via margin-left */ 38 | li.L0,li.L1,li.L2,li.L3,li.L5,li.L6,li.L7,li.L8 { list-style-type: none } 39 | /* Alternate shading for lines */ 40 | li.L1,li.L3,li.L5,li.L7,li.L9 { } 41 | 42 | @media print { 43 | pre .str, code .str { color: #060; } 44 | pre .kwd, code .kwd { color: #006; font-weight: bold; } 45 | pre .com, code .com { color: #600; font-style: italic; } 46 | pre .typ, code .typ { color: #404; font-weight: bold; } 47 | pre .lit, code .lit { color: #044; } 48 | pre .pun, code .pun { color: #440; } 49 | pre .pln, code .pln { color: #000; } 50 | pre .tag, code .tag { color: #006; font-weight: bold; } 51 | pre .atn, code .atn { color: #404; } 52 | pre .atv, code .atv { color: #060; } 53 | } 54 | 
-------------------------------------------------------------------------------- /output/images/CAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/CAP.png -------------------------------------------------------------------------------- /output/images/CAP_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/CAP_choice.png -------------------------------------------------------------------------------- /output/images/barroso_holzle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/barroso_holzle.png -------------------------------------------------------------------------------- /output/images/chandra_failure_detectors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/chandra_failure_detectors.png -------------------------------------------------------------------------------- /output/images/dist-sys-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/dist-sys-cover.png -------------------------------------------------------------------------------- /output/images/epoch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/epoch.png -------------------------------------------------------------------------------- /output/images/format_epub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/format_epub.png -------------------------------------------------------------------------------- /output/images/format_html.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/format_html.png -------------------------------------------------------------------------------- /output/images/format_mobi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/format_mobi.png -------------------------------------------------------------------------------- /output/images/format_pdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/format_pdf.png -------------------------------------------------------------------------------- /output/images/git-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/git-icon.png 
-------------------------------------------------------------------------------- /output/images/global-clock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/global-clock.png -------------------------------------------------------------------------------- /output/images/google-transact09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/google-transact09.png -------------------------------------------------------------------------------- /output/images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/image.png -------------------------------------------------------------------------------- /output/images/local-clock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/local-clock.png -------------------------------------------------------------------------------- /output/images/news_120.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/news_120.jpg -------------------------------------------------------------------------------- /output/images/oltp_overhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/oltp_overhead.png -------------------------------------------------------------------------------- /output/images/part-repl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/part-repl.png -------------------------------------------------------------------------------- /output/images/pbs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/pbs.png -------------------------------------------------------------------------------- /output/images/replication-async.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/replication-async.png -------------------------------------------------------------------------------- /output/images/replication-both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/replication-both.png -------------------------------------------------------------------------------- /output/images/replication-sync.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/replication-sync.png 
-------------------------------------------------------------------------------- /output/images/replication.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/replication.pptx -------------------------------------------------------------------------------- /output/images/statediagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/statediagram.png -------------------------------------------------------------------------------- /output/images/system-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/system-model.png -------------------------------------------------------------------------------- /output/images/system-of-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/system-of-2.png -------------------------------------------------------------------------------- /output/images/system-of-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/system-of-3.png -------------------------------------------------------------------------------- /output/images/vector_clock.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/images/vector_clock.svg.png -------------------------------------------------------------------------------- /output/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Distributed systems for fun and profit 5 | 6 | 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 |
      30 |
      31 |
      32 |

      Distributed systems

      33 |

      for fun and profit

      34 |
      35 | 42 |
      43 | 44 | 45 |
      46 |
      47 |
      48 | 69 |
      70 |
      71 |
      72 | 73 | 78 | 79 |
      80 |
      81 |
      82 | 83 | 84 | 88 |
      89 |
      90 |
      91 | 92 | 93 | 94 |
      95 |
      96 |

      Introduction

      97 |

      I wanted a text that would bring together the ideas behind many of the more recent distributed systems - systems such as Amazon's Dynamo, Google's BigTable and MapReduce, Apache's Hadoop and so on.

      98 |

In this text I've tried to provide a more accessible introduction to distributed systems. To me, that means two things: introducing the key concepts that you will need in order to have a good time reading more serious texts, and providing a narrative that covers things in enough detail that you get the gist of what's going on without getting stuck on details. It's 2013, you've got the Internet, and you can selectively read more about the topics you find most interesting.

      99 |

      In my view, much of distributed programming is about dealing with the implications of two consequences of distribution:

      100 |
        101 |
      • that information travels at the speed of light
      • 102 |
      • that independent things fail independently*
      • 103 |
      104 |

In other words, the core of distributed programming is dealing with distance (duh!) and having more than one thing (duh!). These constraints define a space of possible system designs, and my hope is that after reading this you'll have a better sense of how distance, time and consistency models interact.

      105 |

This text is focused on distributed programming and systems concepts you'll need to understand commercial systems in the data center. It would be madness to attempt to cover everything. You'll learn many key protocols and algorithms (covering, for example, many of the most cited papers in the discipline), including some exciting new ways to look at eventual consistency that haven't yet made it into college textbooks - such as CRDTs and the CALM theorem.

      106 |

      I hope you like it! If you want to say thanks, follow me on Github (or Twitter). And if you spot an error, file a pull request on Github.

      107 |
      108 |

      1. Basics

      109 |

The first chapter covers distributed systems at a high level by introducing a number of important terms and concepts. It covers high-level goals, such as scalability, availability, performance, latency and fault tolerance; why they are hard to achieve; and how abstractions and models as well as partitioning and replication come into play.

      110 |

      2. Up and down the level of abstraction

      111 |

The second chapter dives deeper into abstractions and impossibility results. It starts with a Nietzsche quote, then introduces system models and the many assumptions that are made in a typical system model. Next, it discusses the CAP theorem and summarizes the FLP impossibility result, before turning to the implications of the CAP theorem - one of which is that one ought to explore other consistency models. A number of consistency models are then discussed.

      112 |

      3. Time and order

      113 |

A big part of understanding distributed systems is about understanding time and order. To the extent that we fail to understand and model time, our systems will fail. The third chapter discusses time, order and clocks, as well as their various uses (such as vector clocks and failure detectors).

      114 |

      4. Replication: preventing divergence

      115 |

      The fourth chapter introduces the replication problem, and the two basic ways in which it can be performed. It turns out that most of the relevant characteristics can be discussed with just this simple characterization. Then, replication methods for maintaining single-copy consistency are discussed from the least fault tolerant (2PC) to Paxos.

      116 |

      5. Replication: accepting divergence

      117 |

The fifth chapter discusses replication with weak consistency guarantees. It introduces a basic reconciliation scenario, where partitioned replicas attempt to reach agreement. It then discusses Amazon's Dynamo as an example of a system design with weak consistency guarantees. Finally, two perspectives on disorderly programming are discussed: CRDTs and the CALM theorem.

      118 |

      6. Appendix

      119 |

      The appendix covers recommendations for further reading.

      120 |
      121 |

      *: This is a lie. This post by Jay Kreps elaborates. 122 |

      123 | 124 |
      125 | 126 | 127 |
      128 | 129 | 130 |
      131 |
      132 |
      133 | 134 | 137 | 138 |
      139 |
      140 |
      141 | 142 | 145 | 146 |
      147 |
      148 |
      149 | 150 | 168 | 169 | 170 | 171 |
      172 |
      173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /output/intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Distributed systems for fun and profit 5 | 6 | 14 | 15 | 16 | 17 | 18 | 27 | 28 | 29 |
      30 |
      31 |
      32 |

      Distributed systems

      33 |

      for fun and profit

      34 |
      35 | 42 |
      43 | 44 | 45 |
      46 |
      47 |
      48 | 49 | 50 | 54 |
      55 |
      56 |
      57 | 58 | 59 | 60 |
      61 |
      62 |

      1. Distributed systems at a high level

      63 |
      64 |

      Distributed programming is the art of solving the same problem that you can solve on a single computer using multiple computers.

      65 |
      66 |

      There are two basic tasks that any computer system needs to accomplish:

      67 |
        68 |
      • storage and
      • 69 |
      • computation
      • 70 |
      71 |

      Distributed programming is the art of solving the same problem that you can solve on a single computer using multiple computers - usually, because the problem no longer fits on a single computer.

      72 |

      Nothing really demands that you use distributed systems. Given infinite money and infinite R&D time, we wouldn't need distributed systems. All computation and storage could be done on a magic box - a single, incredibly fast and incredibly reliable system that you pay someone else to design for you.

      73 |

However, few people have infinite resources. Hence, they have to find the right place on some real-world cost-benefit curve. At a small scale, upgrading hardware is a viable strategy. But as problem sizes increase, you will reach a point where either the hardware upgrade that would allow you to solve the problem on a single node does not exist, or it becomes cost-prohibitive. At that point, I welcome you to the world of distributed systems.

      74 |

      It is a current reality that the best value is in mid-range, commodity hardware - as long as the maintenance costs can be kept down through fault-tolerant software.

      75 |

Computations primarily benefit from high-end hardware to the extent that they can replace slow network accesses with internal memory accesses. The performance advantage of high-end hardware is limited in tasks that require large amounts of communication between nodes.

      76 |

[figure: cost-efficiency (images/barroso_holzle.png)]

      77 |

      As the figure above from Barroso, Clidaras & Hölzle shows, the performance gap between high-end and commodity hardware decreases with cluster size assuming a uniform memory access pattern across all nodes.

      78 |

      Ideally, adding a new machine would increase the performance and capacity of the system linearly. But of course this is not possible, because there is some overhead that arises due to having separate computers. Data needs to be copied around, computation tasks have to be coordinated and so on. This is why it's worthwhile to study distributed algorithms - they provide efficient solutions to specific problems, as well as guidance about what is possible, what the minimum cost of a correct implementation is, and what is impossible.

      79 |

      The focus of this text is on distributed programming and systems in a mundane, but commercially relevant setting: the data center. For example, I will not discuss specialized problems that arise from having an exotic network configuration, or that arise in a shared-memory setting. Additionally, the focus is on exploring the system design space rather than on optimizing any specific design - the latter is a topic for a much more specialized text.

      80 |

      What we want to achieve: Scalability and other good things

      81 |

      The way I see it, everything starts with the need to deal with size.

      82 |

Most things are trivial at a small scale - and the same problem becomes much harder once you surpass a certain size, volume or other physical constraint. It's easy to lift a piece of chocolate, it's hard to lift a mountain. It's easy to count how many people are in a room, and hard to count how many people are in a country.

      83 |

So everything starts with size - scalability. Informally speaking, in a scalable system, as we move from small to large, things should not get incrementally worse. Here's another definition:

      84 |
      85 |
      Scalability
      86 |
is the ability of a system, network, or process to handle a growing amount of work in a capable manner, or its ability to be enlarged to accommodate that growth.
      87 |
      88 | 89 |

      What is it that is growing? Well, you can measure growth in almost any terms (number of people, electricity usage etc.). But there are three particularly interesting things to look at:

      90 |
        91 |
      • Size scalability: adding more nodes should make the system linearly faster; growing the dataset should not increase latency
      • 92 |
      • Geographic scalability: it should be possible to use multiple data centers to reduce the time it takes to respond to user queries, while dealing with cross-data center latency in some sensible manner.
      • 93 |
      • Administrative scalability: adding more nodes should not increase the administrative costs of the system (e.g. the administrators-to-machines ratio).
      • 94 |
      95 |

      Of course, in a real system growth occurs on multiple different axes simultaneously; each metric captures just some aspect of growth.

      96 |

      A scalable system is one that continues to meet the needs of its users as scale increases. There are two particularly relevant aspects - performance and availability - which can be measured in various ways.

      97 |

      Performance (and latency)

      98 |
      99 |
      Performance
      100 |
      is characterized by the amount of useful work accomplished by a computer system compared to the time and resources used.
      101 |
      102 | 103 |

      Depending on the context, this may involve achieving one or more of the following:

      104 |
        105 |
      • Short response time/low latency for a given piece of work
      • 106 |
      • High throughput (rate of processing work)
      • 107 |
      • Low utilization of computing resource(s)
      • 108 |
      109 |

There are tradeoffs involved in optimizing for any of these outcomes. For example, a system may achieve higher throughput by processing larger batches of work, thereby reducing per-operation overhead. The tradeoff would be longer response times for individual pieces of work due to batching.
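
As a rough sketch of how batching trades latency for throughput (illustrative only - the 50 ms window, queue and processBatch are made-up names and parameters, not something from this text), consider:

// Sketch: buffer individual writes and process them as one batch.
var queue = [];
var flushIntervalMs = 50; // assumed batching window; each item may wait up to ~50 ms

function write(item) {
  queue.push(item); // cheap: just buffer the work
}

function processBatch(batch) { // hypothetical expensive operation,
  console.log('processed', batch.length, 'items'); // e.g. one disk flush or one round trip
}

setInterval(function() {
  if (queue.length === 0) return;
  processBatch(queue.splice(0, queue.length)); // overhead amortized over the whole batch
}, flushIntervalMs);

The per-operation overhead is paid once per batch (higher throughput), but every item now also waits for the window to expire before it is processed (higher latency).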

      110 |

      I find that low latency - achieving a short response time - is the most interesting aspect of performance, because it has a strong connection with physical (rather than financial) limitations. It is harder to address latency using financial resources than the other aspects of performance.


      There are a lot of really specific definitions for latency, but I really like the idea that the etymology of the word evokes:

      Latency: the state of being latent; delay, a period between the initiation of something and the occurrence.

      And what does it mean to be "latent"?

      Latent: from Latin latens, latentis, present participle of lateo ("lie hidden"); existing or present but concealed or inactive.

      This definition is pretty cool, because it highlights how latency is really the time between when something happened and the time it has an impact or becomes visible.


      For example, imagine that you are infected with an airborne virus that turns people into zombies. The latent period is the time between when you became infected, and when you turn into a zombie. That's latency: the time during which something that has already happened is concealed from view.


      Let's assume for a moment that our distributed system does just one high-level task: given a query, it takes all of the data in the system and calculates a single result. In other words, think of a distributed system as a data store with the ability to run a single deterministic computation (function) over its current content:


      result = query(all data in the system)


      Then, what matters for latency is not the amount of old data, but rather the speed at which new data "takes effect" in the system. For example, latency could be measured in terms of how long it takes for a write to become visible to readers.
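
      One way to turn that into a concrete measurement is to write to one node and poll another until the value shows up. A minimal sketch - the `writeNode` and `readNode` objects and their callback-style `put`/`get` are hypothetical, not an API defined anywhere in this book:

```js
// Measures how long a write to writeNode stays invisible at readNode.
// writeNode.put(key, value, cb) and readNode.get(key, cb) are assumed APIs.
function measureVisibilityLatency(writeNode, readNode, key, value, done) {
  var start = Date.now();
  writeNode.put(key, value, function() {
    (function poll() {
      readNode.get(key, function(err, seen) {
        if (!err && seen === value) return done(Date.now() - start);
        setImmediate(poll); // not visible yet - try again
      });
    })();
  });
}
```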


      The other key point based on this definition is that if nothing happens, there is no "latent period". A system in which data doesn't change doesn't (or shouldn't) have a latency problem.


      In a distributed system, there is a minimum latency that cannot be overcome: the speed of light limits how fast information can travel, and hardware components have a minimum latency cost incurred per operation (think RAM and hard drives but also CPUs).


      How much that minimum latency impacts your queries depends on the nature of those queries and the physical distance the information needs to travel.
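
      To get a feel for the size of that minimum, here's a back-of-the-envelope sketch. The distances are rough, and real links are worse than the vacuum figure - light in fibre travels at roughly two thirds of c, and routing adds more on top:

```js
// Light travels ~300 km per millisecond in a vacuum, ~200 km/ms in fibre.
function minRoundTripMs(distanceKm, kmPerMs) {
  return (2 * distanceKm) / kmPerMs;
}

console.log(minRoundTripMs(6000, 300)); // ~40 ms across the Atlantic, at best
console.log(minRoundTripMs(6000, 200)); // ~60 ms in fibre - before any routing
```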


      Availability (and fault tolerance)


      The second aspect of a scalable system is availability.

      Availability: the proportion of time a system is in a functioning condition. If a user cannot access the system, it is said to be unavailable.

      Distributed systems allow us to achieve desirable characteristics that would be hard to accomplish on a single system. For example, a single machine cannot tolerate any failures since it either fails or doesn't.


      Distributed systems can take a bunch of unreliable components, and build a reliable system on top of them.


      Systems that have no redundancy can only be as available as their underlying components. Systems built with redundancy can be tolerant of partial failures and thus be more available. It is worth noting that "redundant" can mean different things depending on what you look at - components, servers, datacenters and so on.
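
      The arithmetic behind that claim, assuming - unrealistically, but usefully - that each component is up 99% of the time and fails independently of the others:

```js
var componentAvailability = 0.99;

// No redundancy: a system that needs all 10 of its components is only
// as available as their product - worse than any individual part.
var allRequired = Math.pow(componentAvailability, 10);        // ~0.904

// Redundancy: a system that needs just 1 of 3 copies fails only
// when every copy is down at the same time.
var anyOfThree = 1 - Math.pow(1 - componentAvailability, 3);  // ~0.999999

console.log(allRequired, anyOfThree);
```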


      Formulaically, availability is: Availability = uptime / (uptime + downtime).


      Availability from a technical perspective is mostly about being fault tolerant. Because the probability of a failure occurring increases with the number of components, the system should be able to compensate so as to not become less reliable as the number of components increases.


      For example:

      | Availability %         | How much downtime is allowed per year? |
      | ---------------------- | --------------------------------------- |
      | 90% ("one nine")       | More than a month                        |
      | 99% ("two nines")      | Less than 4 days                         |
      | 99.9% ("three nines")  | Less than 9 hours                        |
      | 99.99% ("four nines")  | Less than an hour                        |
      | 99.999% ("five nines") | ~5 minutes                               |
      | 99.9999% ("six nines") | ~31 seconds                              |
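
      The numbers in the table follow directly from the availability formula; for example:

```js
// Downtime allowed per year at a given availability percentage.
var HOURS_PER_YEAR = 365 * 24;

function downtimeHoursPerYear(availabilityPercent) {
  return HOURS_PER_YEAR * (1 - availabilityPercent / 100);
}

console.log(downtimeHoursPerYear(99.9));   // 8.76 hours ("three nines")
console.log(downtimeHoursPerYear(99.999)); // ~0.0876 hours, i.e. ~5 minutes
```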

      Availability is in some sense a much wider concept than uptime, since the availability of a service can also be affected by, say, a network outage or the company owning the service going out of business (a factor that is not really relevant to fault tolerance, but that would still influence the availability of the system). But without knowing every single specific aspect of the system, the best we can do is design for fault tolerance.


      What does it mean to be fault tolerant?

      Fault tolerance: the ability of a system to behave in a well-defined manner once faults occur.

      Fault tolerance boils down to this: define what faults you expect and then design a system or an algorithm that is tolerant of them. You can't tolerate faults you haven't considered.


      What prevents us from achieving good things?


      Distributed systems are constrained by two physical factors:

      • the number of nodes (which increases with the required storage and computation capacity)
      • the distance between nodes (information travels, at best, at the speed of light)

      Working within those constraints:

      • an increase in the number of independent nodes increases the probability of failure in a system (reducing availability and increasing administrative costs)
      • an increase in the number of independent nodes may increase the need for communication between nodes (reducing performance as scale increases)
      • an increase in geographic distance increases the minimum latency for communication between distant nodes (reducing performance for certain operations)

      Beyond these tendencies - which are a result of the physical constraints - is the world of system design options.


      Both performance and availability are defined by the external guarantees the system makes. On a high level, you can think of the guarantees as the SLA (service level agreement) for the system: if I write data, how quickly can I access it elsewhere? After the data is written, what guarantees do I have of durability? If I ask the system to run a computation, how quickly will it return results? When components fail, or are taken out of operation, what impact will this have on the system?


      There is another criterion, which is not explicitly mentioned but implied: intelligibility. How understandable are the guarantees that are made? Of course, there are no simple metrics for what is intelligible.


      I was kind of tempted to put "intelligibility" under physical limitations. After all, it is a hardware limitation in people that we have a hard time understanding anything that involves more moving things than we have fingers. That's the difference between an error and an anomaly - an error is incorrect behavior, while an anomaly is unexpected behavior. If you were smarter, you'd expect the anomalies to occur.


      Abstractions and models


      This is where abstractions and models come into play. Abstractions make things more manageable by removing real-world aspects that are not relevant to solving a problem. Models describe the key properties of a distributed system in a precise manner. I'll discuss many kinds of models in the next chapter, such as:

      • System model (asynchronous / synchronous)
      • Failure model (crash-fail, partitions, Byzantine)
      • Consistency model (strong, eventual)

      A good abstraction makes working with a system easier to understand, while capturing the factors that are relevant for a particular purpose.


      There is a tension between the reality that there are many nodes and our desire for systems that "work like a single system". Often, the most familiar model (for example, implementing a shared memory abstraction on a distributed system) is too expensive.


      A system that makes weaker guarantees has more freedom of action, and hence potentially greater performance - but it is also potentially hard to reason about. People are better at reasoning about systems that work like a single system, rather than a collection of nodes.


      One can often gain performance by exposing more details about the internals of the system. For example, in columnar storage, the user can (to some extent) reason about the locality of the key-value pairs within the system and hence make decisions that influence the performance of typical queries. Systems which hide these kinds of details are easier to understand (since they act more like a single unit, with fewer details to think about), while systems that expose more real-world details may be more performant (because they correspond more closely to reality).


      Several types of failures make writing distributed systems that act like a single system difficult. Network latency and network partitions (e.g. total network failure between some nodes) mean that a system needs to sometimes make hard choices about whether it is better to stay available but lose some crucial guarantees that cannot be enforced, or to play it safe and refuse clients when these types of failures occur.


      The CAP theorem - which I will discuss in the next chapter - captures some of these tensions. In the end, the ideal system meets both programmer needs (clean semantics) and business needs (availability/consistency/latency).


      Design techniques: partition and replicate


      The manner in which a data set is distributed between multiple nodes is very important. In order for any computation to happen, we need to locate the data and then act on it.


      There are two basic techniques that can be applied to a data set. It can be split over multiple nodes (partitioning) to allow for more parallel processing. It can also be copied or cached on different nodes to reduce the distance between the client and the server and for greater fault tolerance (replication).


      Divide and conquer - I mean, partition and replicate.


      The picture below illustrates the difference between these two: partitioned data (A and B below) is divided into independent sets, while replicated data (C below) is copied to multiple locations.


      [Image: partition and replicate]


      This is the one-two punch for solving any problem where distributed computing plays a role. Of course, the trick is in picking the right technique for your concrete implementation; there are many algorithms that implement replication and partitioning, each with different limitations and advantages which need to be assessed against your design objectives.


      Partitioning


      Partitioning is dividing the dataset into smaller distinct independent sets; this is used to reduce the impact of dataset growth since each partition is a subset of the data.

      • Partitioning improves performance by limiting the amount of data to be examined and by locating related data in the same partition.
      • Partitioning improves availability by allowing partitions to fail independently, increasing the number of nodes that need to fail before availability is sacrificed.

      Partitioning is also very much application-specific, so it is hard to say much about it without knowing the specifics. That's why the focus is on replication in most texts, including this one.


      Partitioning is mostly about defining your partitions based on what you think the primary access pattern will be, and dealing with the limitations that come from having independent partitions (e.g. inefficient access across partitions, different rate of growth etc.).
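
      To make the mechanics concrete, here's a minimal hash partitioner. This is a sketch, not a recommendation: the modulo scheme below reshuffles almost every key whenever the partition count changes, which is why real systems tend to prefer consistent hashing or range partitioning:

```js
var crypto = require('crypto');

// Map a key to one of partitionCount partitions by hashing it.
function partitionFor(key, partitionCount) {
  var hash = crypto.createHash('md5').update(String(key)).digest();
  return hash.readUInt32BE(0) % partitionCount;
}

// Every operation on the same key lands on the same partition.
console.log(partitionFor('user:42', 4)); // a stable value in 0..3
```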


      Replication


      Replication is making copies of the same data on multiple machines; this allows more servers to take part in the computation.


      Let me inaccurately quote Homer J. Simpson:


      To replication! The cause of, and solution to, all of life's problems.


      Replication - copying or reproducing something - is the primary way in which we can fight latency.

      • Replication improves performance by making additional computing power and bandwidth applicable to a new copy of the data.
      • Replication improves availability by creating additional copies of the data, increasing the number of nodes that need to fail before availability is sacrificed.

      Replication is about providing extra bandwidth, and caching where it counts. It is also about maintaining consistency in some way according to some consistency model.


      Replication allows us to achieve scalability, performance and fault tolerance. Afraid of loss of availability or reduced performance? Replicate the data to avoid a bottleneck or single point of failure. Slow computation? Replicate the computation on multiple systems. Slow I/O? Replicate the data to a local cache to reduce latency or onto multiple machines to increase throughput.
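
      Here is the idea reduced to a deliberately naive sketch: a register that writes to every copy synchronously and serves reads from whichever copy happens to be nearest. Real replication has to cope with failures, concurrency and ordering - which is exactly where the rest of this book is headed:

```js
// Toy in-memory replicated register. No failures, no concurrency - just the idea.
function ReplicatedRegister(replicaCount) {
  this.replicas = [];
  for (var i = 0; i < replicaCount; i++) {
    this.replicas.push({ value: undefined });
  }
}

// Writes go to every replica; reads can be served by any of them.
ReplicatedRegister.prototype.write = function(value) {
  this.replicas.forEach(function(replica) { replica.value = value; });
};

ReplicatedRegister.prototype.read = function(nearestIndex) {
  return this.replicas[nearestIndex].value;
};

var register = new ReplicatedRegister(3);
register.write('hello');
console.log(register.read(1)); // 'hello' - served by the "nearest" copy
```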


      Replication is also the source of many of the problems, since there are now independent copies of the data that have to be kept in sync on multiple machines - this means ensuring that the replication follows a consistency model.


      The choice of a consistency model is crucial: a good consistency model provides clean semantics for programmers (in other words, the properties it guarantees are easy to reason about) and meets business/design goals such as high availability or strong consistency.


      Only one consistency model for replication - strong consistency - allows you to program as if the underlying data was not replicated. Other consistency models expose some internals of the replication to the programmer. However, weaker consistency models can provide lower latency and higher availability - and are not necessarily harder to understand, just different.
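
      One of those exposed internals in miniature - an invented two-node example where asynchronous replication is simulated with a delayed callback:

```js
var primary = { x: 0 };
var replica = { x: 0 };

// Asynchronous replication: acknowledge the write immediately,
// propagate it to the replica some time later.
function write(value) {
  primary.x = value;
  setTimeout(function() { replica.x = value; }, 100);
}

write(1);
console.log(primary.x); // 1
console.log(replica.x); // 0 - a reader hitting the replica sees stale data
```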


-------------------------------------------------------------------------------- /output/mixu-distributed-systems-book.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/mixu-distributed-systems-book.epub -------------------------------------------------------------------------------- /output/mixu-distributed-systems-book.mobi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mixu/distsysbook/4c86e54b1e601790c7d5fcbb34da6bf1188740d8/output/mixu-distributed-systems-book.mobi -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "book", 3 | "description": "Book about distributed systems", 4 | "version": "1.0.0", 5 | "author": { 6 | "name": "Mikito Takada", 7 | "email": "mixu@mixu.net", 8 | "url": "http://blog.mixu.net/" 9 | }, 10 | "main": "bookgen/generate.js", 11 | "dependencies": { 12 | "marked": "~0.2.5" 13 | }, 14 | "private": "true", 15 | "scripts": { 16 | "start": "node bookgen/generate.js" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Distributed systems: for fun and profit 2 | 3 | Read the book here: http://book.mixu.net/distsys/ 4 | 5 | # Notes 6 | 7 | I've added this repository to make it easier to work with the book, if you feel like it. 8 | 9 | Writing a patch should be easy: edit the markdown files under `./input/` and file a pull request; I'll handle the rest. 10 | 11 | I didn't include the commits from prior to the release, because writing is a messy and painful process of revisions, deletions and rethinking things. 12 | 13 | ```shell 14 | git shortlog -sn 15 | ``` 16 | 17 | tells me that I made 205 commits between October 2012 (1st commit was in September) and September 16th 2013 to write this. 18 | 19 | # Directory structure 20 | 21 | The content of the book is in `./input/`. To generate the book: 22 | 23 | ```shell 24 | npm install 25 | make build 26 | ``` 27 | 28 | which generates the output in `./output/`. 29 | 30 | To rebuild the .epub and .mobi files: 31 | 32 | ```shell 33 | make ebook 34 | ``` 35 | 36 | You need to install Calibre first for the HTML to epub/mobi conversion. 37 | 38 | # Thanks 39 | 40 | Many many thanks to: logpath, alexras, globalcitizen, graue, frankshearar, roryokane, jpfuentes2, cmeiklejohn, stevenproctor, eos2102 and steveloughran for their help! 41 | 42 | # Licence 43 | 44 | This book is available for free, but what I've written remains mine. 45 | 46 | Translations: as long as the result made is available for free (you can have ads) I welcome translations. 47 | 48 | Other use: contact me; as long as your intentions are good I'd be happy to figure out something. I'm not looking to make money from the book but I don't want it to be republished without my permission. 49 | 50 | # Icons 51 | 52 | Some icons by Yusuke Kamiyamane. Licensed under a Creative Commons Attribution 3.0 License. 53 | 54 | Git Logo by Jason Long is licensed under the Creative Commons Attribution 3.0 Unported License. 55 | --------------------------------------------------------------------------------