--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Distributed Systems Engineering notes (6.824, Spring 2015)
2 | ==========================================================
3 |
4 | Lectures
5 | --------
6 |
7 | Lecture notes from 6.824, taught by [Prof. Robert T. Morris](http://pdos.csail.mit.edu/rtm/). These lecture notes are slightly modified from the ones posted on the 6.824 [course website](http://nil.csail.mit.edu/6.824/2015/schedule.html).
8 |
9 | * Lecture 1: [Introduction](l01-intro.html): distributed system definition, motivations, architecture, implementation, performance, fault-tolerance, consistency, MapReduce
10 | * Lecture 2: [Remote Procedure Calls (RPCs)](l02-rpc.html): RPC overview, marshalling, binding, threads, "at-least-once", "at-most-once", "exactly once", Go's RPC, thread synchronization
11 | * Lecture 3: [Fault tolerance](l03-fault-tolerance.html): primary-backup replication, state transfer, "split-brain", Remus (NSDI 2008)
12 | * Lecture 4: [Flat datacenter storage](l04-more-primary-backup.html): flat datacenter storage, bisection bandwidth, striping
13 | * Lecture 5: [Paxos](l05-paxos.html): Paxos, consensus algorithms
14 | + [Paxos algorithm description](paxos-algorithm.html)
15 | * Lecture 6: [Raft](l06-raft.html): Raft, a more understandable consensus algorithm
16 | * Lecture 7: **Google Go** [_guest lecture_](l07-go.html) by Russ Cox
17 | * Lecture 8: [Harp](l08-harp.html): distributed file system, "the UPS trick", witnesses
18 | * Lecture 9: [IVY](l09-dist-comp-seq-consistency.html): distributed shared memory, sequential consistency
19 | * Lecture 10: [TreadMarks](l10-treadmarks.html): userspace distributed shared memory system, vector timestamps, release consistency (lazy/eager), false sharing, write amplification
20 | * Lecture 11: [Ficus](l11-ficus.html): optimistic concurrency control, vector timestamps, conflict resolution
21 | * Lecture 12: [Bayou](l12-bayou.html): disconnected operation, eventual consistency, Bayou
22 | * Lecture 13: [MapReduce](l13-mapreduce.html): MapReduce, scalability, performance
23 | * Lecture 14: **Spark** [_guest lecture_](l14-spark.html) by Matei Zaharia: Resilient Distributed Datasets, Spark
24 | * Lecture 15: **Spanner** [_guest lecture_](l15-spanner.html) by Wilson Hsieh, Google: Spanner, distributed database, clock skew
25 | * Lecture 16: [Memcache at Facebook](l16-memcached.html): web app scalability, look-aside caches, Memcache
26 | * Lecture 17: [PNUTS (Yahoo!)](l17-pnuts.html): distributed key-value store, atomic writes
27 | * Lecture 18: [Dynamo](l18-dynamo.html): distributed key-value store, eventual consistency
28 | * Lecture 19: **HubSpot** [_guest lecture_](l19-hubspot.html)
29 | * Lecture 20: [Two phase commit (2PC)](l20-argus.html): two-phase commit, Argus
30 | * Lecture 21: [Optimistic concurrency control](l21-thor.html)
31 | * Lecture 22: [Peer-to-peer, trackerless Bittorrent and DHTs](l22-peer-to-peer.html): Chord, routing
32 | * Lecture 23: [Bitcoin](l23-bitcoin.html): verifiable public ledgers, proof-of-work, double spending
33 |
34 | Lectures from other years
35 | -------------------------
36 |
37 | * [Practical Byzantine Fault Tolerance (PBFT)](extra/pbft.html)
38 | + Other years: [[2012]](original-notes/pbft-2012.txt), [[2011]](original-notes/pbft-2011.txt), [[2010]](original-notes/pbft-2010.txt), [[2009]](original-notes/pbft-2009.txt), [[2001]](original-notes/pbft-2001.txt), [[PPT]](original-notes/pbft.ppt)
39 |
40 | Labs
41 | ----
42 |
43 | - Lab 1: MapReduce, [[assign]](lab1/index.html)
44 | - Lab 2: A fault-tolerant key/value service, [[assign]](lab2/index.html), [[notes]](lab2/notes.html)
45 | - Lab 3: Paxos-based Key/Value Service, [[assign]](lab3/index.html), [[notes]](lab3/notes.html)
46 | - Lab 4: Sharded Key/Value Service, [[assign]](lab4/index.html), [[notes]](lab4/notes.html)
47 | - Lab 5: Persistent Key/Value Service, [[assign]](lab5/index.html)
48 |
49 | Papers
50 | ------
51 |
52 | Papers we read in 6.824 ([directory here](papers/)):
53 |
54 | 1. [MapReduce](papers/mapreduce.pdf)
55 | 2. [Remus](papers/remus.pdf)
56 | 3. [Flat datacenter storage](papers/fds.pdf)
57 | 4. [Paxos](papers/paxos-simple.pdf)
58 | 5. [Raft](papers/raft-atc14.pdf)
59 | 6. [Harp](papers/bliskov-harp.pdf)
60 | 7. [Shared virtual memory](papers/li-dsm.pdf)
61 | 8. [TreadMarks](papers/keleher-treadmarks.pdf)
62 | 9. [Ficus](papers/ficus.pdf)
63 | 10. [Bayou](papers/bayou-conflicts.pdf)
64 | 11. [Spark](papers/zaharia-spark.pdf)
65 | 12. [Spanner](papers/spanner.pdf)
66 | 13. [Memcached at Facebook](papers/memcache-fb.pdf)
67 | 14. [PNUTS](papers/cooper-pnuts.pdf)
68 | 15. [Dynamo](papers/dynamo.pdf)
69 | 16. [Akamai](papers/akamai.pdf)
70 | 17. [Argus](papers/argus88.pdf), [Guardians and actions](papers/guardians-and-actions-liskov.pdf)
71 | 18. [Kademlia](papers/kademlia.pdf)
72 | 19. [Bitcoin](papers/bitcoin.pdf)
73 | 20. [AnalogicFS](papers/katabi-analogicfs.pdf)
74 |
75 | Other papers:
76 |
77 | 1. [Impossibility of Distributed Consensus with One Faulty Process](papers/flp.pdf)
78 | + See page 5, slide 10 [here](stumbled/flp-consensus.pdf) to understand Lemma 1 (commutativity) faster
79 | + See [this article here](http://the-paper-trail.org/blog/a-brief-tour-of-flp-impossibility/) for an alternative explanation.
80 | 1. [Practical Byzantine Fault Tolerance (PBFT)](papers/pbft.pdf)
81 | + See [discussion here on PBFT](http://the-paper-trail.org/blog/barbara-liskovs-turing-award-and-byzantine-fault-tolerance/#more-211).
82 |
83 | Stumbled upon
84 | -------------
85 |
86 | 1. [A brief history of consensus, 2PC and transaction commit](http://betathoughts.blogspot.com/2007/06/brief-history-of-consensus-2pc-and.html)
87 | 1. [Distributed systems theory for the distributed systems engineer](http://the-paper-trail.org/blog/distributed-systems-theory-for-the-distributed-systems-engineer/)
88 | 1. [Distributed Systems: For fun and Profit](http://book.mixu.net/distsys/)
89 | 1. [You can't choose CA out of CAP](https://codahale.com/you-cant-sacrifice-partition-tolerance/), or "You can't sacrifice partition tolerance"
90 | 1. [Notes on distributed systems for young bloods](https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/)
91 | 1. [Paxos Explained From Scratch](stumbled/paxos-explained-from-scratch.pdf)
92 |
93 | Quizzes
94 | -------
95 |
96 | Prep for quiz 1 [here](exams/quiz1/quiz1.html)
97 |
--------------------------------------------------------------------------------
/code/ivy-code.txt:
--------------------------------------------------------------------------------
1 | This is a copy of the code in Section 3.1 of Li and Hudak's Memory
2 | Coherence in Shared Virtual Memory Systems (1986), somewhat simplified
3 | and clarified. We've deleted the code for the case in which the
4 | manager takes faults -- in this version, the manager does not run
5 | application code. Messages are delivered reliably. There are no
6 | failures.
7 |
8 | ReadFaultHandler(PageNumber p):
9 | lock(ptable[p].lock)
10 | ask manager for read access to p [RQ]
11 | wait for someone to send me p's content [RD]
12 | ptable[p].access = read
13 | send confirmation to manager [RC]
14 | unlock(ptable[p].lock)
15 |
16 | ReadServer(PageNumber p, MachineID request_node):
17 | lock(ptable[p].lock)
18 | if I am owner of p:
19 | ptable[p].access = read
20 | send copy of p to request_node [RD]
21 | unlock(ptable[p].lock)
22 |
23 | if I am manager:
24 | lock(info[p].lock)
25 | info[p].copy_set |= request_node
26 | ask info[p].owner to send copy of p to request_node [RF]
27 | wait for confirmation from request_node [RC]
28 | unlock(info[p].lock)
29 |
30 | WriteFaultHandler(PageNumber p):
31 | lock(ptable[p].lock)
32 | ask manager for write access to p [WQ]
33 | wait for someone to send me p's content [WD]
34 | ptable[p].access = write
35 | send confirmation to manager [WC]
36 | unlock(ptable[p].lock)
37 |
38 | WriteServer(PageNumber p, MachineID request_node):
39 | lock(ptable[p].lock)
40 | if I am owner of p:
41 | send copy of p to request_node [WD]
42 | ptable[p].access = nil
43 | unlock(ptable[p].lock)
44 |
45 | if I am manager:
46 | lock(info[p].lock)
47 | send invalidate to each node in info[p].copy_set [IV]
48 | wait for all invalidate confirmations [IC]
49 | info[p].copy_set = empty
50 | ask info[p].owner to send copy of p to request_node [WF]
51 | info[p].owner = request_node
52 | wait for confirmation from request_node [WC]
53 | unlock(info[p].lock)
54 |
55 | InvalidateServer(PageNumber p):
56 | # no lock...
57 | ptable[p].access = nil
58 | send confirmation to manager [IC]
59 |
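For reference, a minimal Go-flavored sketch of the per-page state the handlers
above manipulate (names and types are illustrative guesses, not from Li and
Hudak's paper):

    package ivy // hypothetical package, just to make the sketch self-contained

    import "sync"

    type MachineID int

    // kept on every node, one entry per page
    type PageTableEntry struct {
        lock   sync.Mutex
        access string // "nil", "read", or "write"
    }

    // kept only on the manager, one entry per page
    type ManagerInfo struct {
        lock    sync.Mutex
        copySet map[MachineID]bool // nodes holding read copies; emptied on write
        owner   MachineID          // node with the current copy of the page
    }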
--------------------------------------------------------------------------------
/code/l-rpc.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //
4 | // toy RPC library
5 | //
6 |
7 | import "io"
8 | import "fmt"
9 | import "sync"
10 | import "encoding/binary"
11 |
12 | type ToyClient struct {
13 | mu sync.Mutex
14 | conn io.ReadWriteCloser // connection to server
15 | xid int64 // next unique request #
16 | pending map[int64]chan int32 // waiting calls [xid]
17 | }
18 |
19 | func MakeToyClient(conn io.ReadWriteCloser) *ToyClient {
20 | tc := &ToyClient{}
21 | tc.conn = conn
22 | tc.pending = map[int64]chan int32{}
23 | tc.xid = 1
24 | go tc.Listener()
25 | return tc
26 | }
27 |
28 | func (tc *ToyClient) WriteRequest(xid int64, procNum int32, arg int32) {
29 | binary.Write(tc.conn, binary.LittleEndian, xid)
30 | binary.Write(tc.conn, binary.LittleEndian, procNum)
31 | if err := binary.Write(tc.conn, binary.LittleEndian, arg); err != nil {
32 | fmt.Printf("xx %v\n", err)
33 | }
34 | }
35 |
36 | func (tc *ToyClient) ReadReply() (int64, int32) {
37 | var xid int64
38 | var arg int32
39 | binary.Read(tc.conn, binary.LittleEndian, &xid)
40 | binary.Read(tc.conn, binary.LittleEndian, &arg)
41 | return xid, arg
42 | }
43 |
44 | //
45 | // client application uses Call() to make an RPC.
46 | // client := MakeClient(server)
47 | // reply := client.Call(procNum, arg)
48 | //
49 | func (tc *ToyClient) Call(procNum int32, arg int32) int32 {
50 | done := make(chan int32) // for tc.Listener()
51 |
52 | tc.mu.Lock()
53 | xid := tc.xid // allocate a unique xid
54 | tc.xid++
55 | tc.pending[xid] = done // for tc.Listener()
56 | tc.WriteRequest(xid, procNum, arg) // send to server
57 | tc.mu.Unlock()
58 |
59 | reply := <- done // wait for reply via tc.Listener()
60 |
61 | tc.mu.Lock()
62 | delete(tc.pending, xid)
63 | tc.mu.Unlock()
64 |
65 | return reply
66 | }
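// Note that Call() has no timeouts or retransmissions: if a request or its
// reply is lost, Call() blocks on the done channel forever. Dealing with that
// is the "at-least-once" / "at-most-once" territory from the RPC lecture.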
67 |
68 | //
69 | // listen for replies from the server and hand each one to the
70 | // waiting Call() via its pending channel. runs as a background goroutine.
71 | //
72 | func (tc *ToyClient) Listener() {
73 | for {
74 | xid, reply := tc.ReadReply()
75 | tc.mu.Lock()
76 | ch, ok := tc.pending[xid]
77 | tc.mu.Unlock()
78 | if ok {
79 | ch <- reply
80 | }
81 | }
82 | }
83 |
84 | type ToyServer struct {
85 | mu sync.Mutex
86 | conn io.ReadWriteCloser // connection from client
87 | handlers map[int32]func(int32)int32 // procedures
88 | }
89 |
90 | func MakeToyServer(conn io.ReadWriteCloser) *ToyServer {
91 | ts := &ToyServer{}
92 | ts.conn = conn
93 | ts.handlers = map[int32](func(int32)int32){}
94 | go ts.Dispatcher()
95 | return ts
96 | }
97 |
98 | func (ts *ToyServer) WriteReply(xid int64, arg int32) {
99 | binary.Write(ts.conn, binary.LittleEndian, xid)
100 | binary.Write(ts.conn, binary.LittleEndian, arg)
101 | }
102 |
103 | func (ts *ToyServer) ReadRequest() (int64, int32, int32) {
104 | var xid int64
105 | var procNum int32
106 | var arg int32
107 | binary.Read(ts.conn, binary.LittleEndian, &xid)
108 | binary.Read(ts.conn, binary.LittleEndian, &procNum)
109 | binary.Read(ts.conn, binary.LittleEndian, &arg)
110 | return xid, procNum, arg
111 | }
112 |
113 | //
114 | // listen for client requests,
115 | // dispatch them to the right handler,
116 | // send back replies.
117 | //
118 | func (ts *ToyServer) Dispatcher() {
119 | for {
120 | xid, procNum, arg := ts.ReadRequest()
121 | ts.mu.Lock()
122 | fn, ok := ts.handlers[procNum]
123 | ts.mu.Unlock()
124 | go func() {
125 | var reply int32
126 | if ok {
127 | reply = fn(arg)
128 | }
129 | ts.mu.Lock()
130 | ts.WriteReply(xid, reply)
131 | ts.mu.Unlock()
132 | }()
133 | }
134 | }
135 |
136 | type Pair struct {
137 | r *io.PipeReader
138 | w *io.PipeWriter
139 | }
140 | func (p Pair) Read(data []byte) (int, error) {
141 | return p.r.Read(data)
142 | }
143 | func (p Pair) Write(data []byte) (int, error) {
144 | return p.w.Write(data)
145 | }
146 | func (p Pair) Close() error {
147 | p.r.Close()
148 | return p.w.Close()
149 | }
150 |
151 | func main() {
152 | r1, w1 := io.Pipe()
153 | r2, w2 := io.Pipe()
154 | cp := Pair{r : r1, w : w2}
155 | sp := Pair{r : r2, w : w1}
156 | tc := MakeToyClient(cp)
157 | ts := MakeToyServer(sp)
158 | ts.handlers[22] = func(a int32) int32 { return a+1 }
159 |
160 | reply := tc.Call(22, 100)
161 | fmt.Printf("Call(22, 100) -> %v\n", reply)
162 | }
163 |
--------------------------------------------------------------------------------
/echo-disclaimer.sh:
--------------------------------------------------------------------------------
1 | echo -e "**Note:** These lecture notes were slightly modified from the ones posted on the
2 | 6.824 [course website](http://nil.csail.mit.edu/6.824/2015/schedule.html)
3 | from Spring 2015.\n"
4 |
5 |
--------------------------------------------------------------------------------
/exams/pdfs/q02-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q02-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q04-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q04-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q04-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q04-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q05-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q05-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q05-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q05-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q06-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q06-1.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q07-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q07-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q07-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q07-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q09-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q09-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q09-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q09-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q10-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q10-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q10-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q10-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q11-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q11-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q11-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q11-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q12-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q12-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q12-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q12-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q13-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q13-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q13-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q13-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q14-1-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q14-1-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q14-2-ans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q14-2-ans.pdf
--------------------------------------------------------------------------------
/exams/pdfs/q6-02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/pdfs/q6-02.pdf
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/paxos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/paxos.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-1-1.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-2-2.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-3-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-3-3.pdf
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-4-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-4-4.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-4-5.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-5-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-5-6.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-5-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-5-7.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-5-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-5-8.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-6-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-6-9.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10a.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10b.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10c.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10d.png
--------------------------------------------------------------------------------
/exams/quiz1/qs/q1-2014/q14-7-10e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/quiz1/qs/q1-2014/q14-7-10e.png
--------------------------------------------------------------------------------
/exams/quiz2/quiz2.md:
--------------------------------------------------------------------------------
1 | 2006, Quiz 2
2 | ============
3 |
4 | 2PC
5 | ---
6 |
7 | ### Q1
8 |
9 | If all the Si's say "I did not send a NO", that does not imply they said "YES": it could
10 | be that some of the Si's did not get the PREPARE yet => can't commit
11 |
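A quick Go sketch (mine, not from the official solution) of the decision rule this
answer leans on: the coordinator needs an explicit YES from every participant before
it can commit; "nobody said NO" is not the same thing, because a participant that
never received the PREPARE has not voted at all.

```go
package main

import "fmt"

// canCommit is a hypothetical coordinator check: commit only if every
// participant explicitly voted YES; a missing vote (e.g. a lost PREPARE)
// counts as "not YES" and forces an abort.
func canCommit(participants []string, votes map[string]string) bool {
	for _, p := range participants {
		if votes[p] != "YES" {
			return false
		}
	}
	return true
}

func main() {
	votes := map[string]string{"S1": "YES"}             // S2 never got the PREPARE
	fmt.Println(canCommit([]string{"S1", "S2"}, votes)) // false => can't commit
}
```
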
12 | ### Q2
13 |
14 |
15 |
16 | 2009, Quiz 1
17 | ============
18 |
19 | Bayou
20 | -----
21 |
22 | ### Question 4
23 |
24 | If David syncs with the primary first, or if David's logical timestamp is higher
25 | than MIT Daycare's (either due to the clock or node ID being higher) and they
26 | sync and then one of them talks to the primary.
27 |
28 | 2010, Quiz 1
29 | ============
30 |
31 | Bayou
32 | -----
33 |
34 | ### Question 7
35 |
36 | A: a1t0
37 | B: b1t1
38 | C: c1t2
39 |
40 | B syncs with C, C commits with S => b1 commits first
41 | A syncs with B => A's tentative schedule is: a1t0 b1t1
42 | A syncs with S => A's schedule changes to: b1t1 a1t0
43 |
44 | 2011, Quiz 1
45 | ============
46 |
47 | Bayou
48 | -----
49 |
50 | ### Question 4
51 |
52 | The logical clock scheme would work better than using real-time clocks when
53 | those real-time clocks are out of sync:
54 |
55 | N1 real time clock says 9:00am (actual time is 9:00am)
56 | N2 real time clock says 10:00am (actual time is 9:00am)
57 |
58 | If N1 sends an update of a file F to N2, then N2 will ignore it because its clock
59 | is too far ahead.
60 |
61 | ### Question 5
62 |
63 | Conflict resolution. Resolving update conflicts.
64 |
65 | ### Question 6
66 |
67 | A: a1
68 | B: b1
69 |
70 | B commits with S => b1 gets CSN 1
71 | B syncs with A => a1, b1
72 | A syncs with S => updates get reordered b1, a1
73 |
74 | ### Question 7
75 |
76 | A: a1
77 | B: b1
78 | C: c1
79 |
80 | pairwise sync (A-B, B-C)
81 |
82 | A: a1, b1
83 | B: a1, b1, c1
84 | C: b1, c1
85 |
86 | A creates a1
87 | B syncs with A,
88 | B creates b1 after a1
89 | C syncs with B
90 | C creates c1 after b1
91 |
92 | C syncs first with S => b1 gets CSN 1, c1 gets CSN 2
93 | B then syncs with S => b1 is already committed and a1 gets CSN 3 (weird)
94 |
95 | Their answer: server i reserves a room for 10/11/12pm w/ TS i after syncing with
96 | server (i-1), which did the same before it.
97 |
98 | ### Question 8
99 |
100 | The node with the highest node ID will see all of its updates constantly being
101 | rescheduled every time it syncs with someone. A lot of its updates could fail as
102 | a result?
103 |
104 | 2011, Quiz 2
105 | ============
106 |
107 | Argus, two-phase commit
108 | -----------------------
109 |
110 |
111 | ### Q3
112 |
113 | 2PC does not provide fault tolerance or availability: if one server is down,
114 | no one can proceed. Ben's system will not be very available.
115 |
116 | 2012, Quiz 1
117 | ============
118 |
119 | Argus, 2PC
120 | ----------
121 |
122 | Bayou
123 | -----
124 |
125 | ### Question 3
126 |
127 | If Victor sends just his update to the primary and Vera's didn't make it there yet,
128 | then Victor wins: his update gets the smaller CSN. If Victor sends both updates
129 | to the primary, then the primary will give them CSNs in order, and Vera's update
130 | will get the 1st CSN => Vera wins. I think the way Bayou works will have Victor
131 | send both updates to the primary => Vera wins
132 |
133 | ### Question 4
134 |
135 | Write `x` has vv: `[s1: 1, s2: 2]`
136 | Write `y` has vv: `[s1: 2, s2: 1]`
137 |
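A small Go sketch (mine, not from the solutions) of why these two writes conflict:
neither vector version dominates the other, so the writes are concurrent.

```go
package main

import "fmt"

// dominatesOrEqual reports whether vector version a >= b componentwise
// (missing entries count as 0).
func dominatesOrEqual(a, b map[string]int) bool {
	for s, bv := range b {
		if a[s] < bv {
			return false
		}
	}
	return true
}

// concurrent: neither write "happened before" the other.
func concurrent(a, b map[string]int) bool {
	return !dominatesOrEqual(a, b) && !dominatesOrEqual(b, a)
}

func main() {
	x := map[string]int{"s1": 1, "s2": 2}
	y := map[string]int{"s1": 2, "s2": 1}
	fmt.Println(concurrent(x, y)) // true: x and y are incomparable => conflict
}
```
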
138 | ### Question 5
139 |
140 | In Bayou, it would not be okay: the reason we used logical clocks was so that
141 | all servers apply the tentative updates in the same order.
142 |
143 | ...but the primary actually orders/commits updates in the order they arrive, so
144 | it seems that it shouldn't matter how clients apply their tentative updates. They
145 | will end up disagreeing more often with this scheme, but ultimately, the primary
146 | will make sure they all agree up to the last committed update.
147 |
148 | 2013, Quiz 2
149 | ============
150 |
151 | Bayou
152 | -----
153 |
154 | ### Question 2
155 |
156 | A: Neha, [-, 1, A]
157 | A: Robert [-, 2, A]
158 | N: Charles [-, 1, N]
159 | A <-> N
160 |
161 | Yes. Say their node IDs are A and N, s.t. A < N. Then they will all display:
162 |
163 | Neha, Charles, Robert
164 |
165 | Because N's update will be after A's Neha update but before A's Robert update.
166 | 1 < 2 and A < N.
167 |
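A small Go sketch (mine, not from the solutions) of the tentative ordering this
argument uses: sort updates by (logical timestamp, node ID), which puts Charles
between Neha and Robert.

```go
package main

import (
	"fmt"
	"sort"
)

// update is a tentative Bayou write: (logical timestamp, node ID) defines the order.
type update struct {
	name string
	ts   int
	node string
}

func main() {
	log := []update{{"Robert", 2, "A"}, {"Charles", 1, "N"}, {"Neha", 1, "A"}}
	sort.Slice(log, func(i, j int) bool {
		if log[i].ts != log[j].ts {
			return log[i].ts < log[j].ts
		}
		return log[i].node < log[j].node // tie-break on node ID
	})
	fmt.Println(log) // [{Neha 1 A} {Charles 1 N} {Robert 2 A}]
}
```
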
168 | ### Question 3
169 |
170 | If no seats are reserved, then the only seat assignment that is possible is
171 | Neha, Charles, Robert.
172 |
173 | If the question refers to all _committed_ seat assignments that are possible,
174 | then Neha and Robert need to maintain their causal ordering, while Charles
175 | can be anywhere in between them, depending on what time N syncs with S.
176 |
177 | ### Question 4
178 |
179 | Either one could be right. If Agent Ack (or another agent) committed on S, then
180 | seat 1 is reserved (because seats are reserved in order). If no agent committed
181 | on S, then Professor Strongly Consistent has a point: Agent Ack could be the
182 | first one to commit on S and get the professor seat #1. The remaining question
183 | is whether Agent Ack can reach the primary S.
184 |
185 | Oh fudge, Sack != Ack. Poor name choosing...
186 |
187 | 2014, Quiz 2
188 | ============
189 |
190 | Bayou
191 | -----
192 |
193 | ### Question 10
194 |
195 | H2's local timestamp starts at 0 and H2 synchronized with H1, whose update had
196 | timestamp 1 => H2's local timestamp will be updated to 1 => H2's update timestamp
197 | will be updated to 2
198 |
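A tiny Go sketch (mine, not from the solutions) of the logical-clock rule being
applied here: on sync, take the max of the local clock and the incoming timestamp,
and stamp the next local update one higher.

```go
package main

import "fmt"

// onSync advances a logical clock past any timestamp seen during a sync.
func onSync(local, incoming int) int {
	if incoming > local {
		return incoming
	}
	return local
}

func main() {
	h2 := 0               // H2's clock starts at 0
	h2 = onSync(h2, 1)    // H1's update carried timestamp 1
	fmt.Println(h2, h2+1) // prints "1 2": H2's next update gets timestamp 2
}
```
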
199 | ### Question 11
200 |
201 | After synchronizing with S:
202 |
203 | H1: [1, 1, H1]
204 | H2: [2, 2, H2]
205 |
206 | Not sure if the central server S also updates H1/H2's logical clocks. I don't
207 | think so.
208 |
209 | Fudge... H3 != H1
210 |
211 | H3 syncs with S => H3 gets [1, 1, H3]
212 | H1 talks to H2 => H2 gets [-, 2, H2]
213 | H2 syncs with S => H2 gets [2, 2, H2]
214 | H1 syncs with S => H1 gets [3, 1, H1]
215 |
216 | Still disagreeing with their answer (H2 gets CSN 3 and H1 gets CSN 2). For some
217 | reason, they assume H1 gets there first. Oh... and it does, because H1 synced
218 | with H2, and H2 syncs with S first, but will include H1's update as the first
219 | one.
220 |
221 | ### Question 12
222 |
223 | Setup:
224 | H3 synced with S => 10am Ben committed
225 | H2 synced with S => 11am Alice committed
226 | H1 synced with S => 10am Bob was rejected. Maybe it goes to 12pm?
227 |
228 | Actions:
229 | H4 syncs with H2 => H4's clock becomes 2
230 | H4 syncs with H3 => H4's clock stays 2. H3's clock becomes 2?
231 |
232 | After syncing with H2, H4 gets 10am Ben, 11am Alice
233 | After syncing with H3, which is behind, the calendar stays the same.
234 |
235 | ### Question 13
236 |
237 | Bayou was developed so that users can operate in offline mode. Paxos wouldn't work
238 | here at all when a majority of the users' nodes are offline. If the question asked
239 | about using Paxos to replicate the primary, then sure, yes, go ahead. But
240 | using Paxos to have the clients' machines agree on their operations' order will
241 | not work well.
242 |
243 |
--------------------------------------------------------------------------------
/exams/raft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/exams/raft.png
--------------------------------------------------------------------------------
/extra/.vimrc:
--------------------------------------------------------------------------------
1 | ../.vimrc
--------------------------------------------------------------------------------
/extra/bayou.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/extra/bayou.ppt
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 | README.html
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
1 | README.md
--------------------------------------------------------------------------------
/l07-go.md:
--------------------------------------------------------------------------------
1 | Russ Cox's lecture on Go
2 | ========================
3 |
4 | Why Go?
5 | ------
6 |
7 | - an answer to the problems of scalability at Google
8 | + `10^6+` machines design point
9 | + it's routine to be running on 1000 machines
10 | + constantly writing programs that coordinate with each other
11 | - sometimes MapReduce works, other times it doesn't
12 |
13 | Who uses Go at Google
14 | ---------------------
15 |
16 | - SPDY proxy for Chrome on mobile devices uses a Go-written _Data Compression Proxy_
17 | - dl.google.com
18 | - YouTube MySQL balancer
19 | - the target is network servers, but it's a great gen. purp. language
20 | - Bitbucket, bitly, GitHub, Dropbox, MongoDB, Mozilla services, NY Times, etc.
21 |
22 | Concurrency
23 | -----------
24 |
25 | - "Communicating Sequential Processes", by Hoare, 1978
26 | + strongly encouraged to read
27 | + in some sense, a generalization of UNIX pipelines
28 | - Bell Labs had some languages developed for concurrency in 80's, 90's:
29 | + Pan, Promela, Newsqueak, Alef, Limbo, Libthread, Concurrent ML
30 | - Google developed Go in the 2000s
31 |
32 | ### There's no goroutine IDs
33 |
34 | - "There's no goroutine IDs, so I can't kill my threads"
35 | + This is what channels are for: just tell your thread via a channel to shut itself off (see the sketch below)
36 | + Also, it's kind of "antisocial" to kill them.
37 | - What we mean is that your program is prolly not gonna work very well if you keep killing your threads like that
38 |
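A minimal runnable sketch (my example, not Russ's code) of the "shut yourself off
via a channel" pattern:

```go
package main

import (
	"fmt"
	"time"
)

// worker stops itself when the quit channel is closed -- no goroutine ID needed.
func worker(quit chan struct{}) {
	for {
		select {
		case <-quit:
			fmt.Println("worker: stopping")
			return
		default:
			time.Sleep(10 * time.Millisecond) // do a unit of work
		}
	}
}

func main() {
	quit := make(chan struct{})
	go worker(quit)
	time.Sleep(50 * time.Millisecond)
	close(quit)                       // ask the worker to stop
	time.Sleep(20 * time.Millisecond) // give it a moment to exit
}
```
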
39 | ### Channels vs. Mutexes
40 |
41 | - if you need a mutex, use a mutex
42 | - if you need a condition variable, think about using a channel instead (see the sketch below)
43 | - don't communicate by sharing memory, you share memory by communicating
44 |
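A minimal sketch (my example, not from the lecture) of the "channel instead of a
condition variable" point; it's the same pattern the toy RPC client in
`code/l-rpc.go` uses with its pending channels:

```go
package main

import "fmt"

func main() {
	done := make(chan int)
	go func() {
		// producer: no mutex/condvar needed, the send both delivers the
		// result and wakes the waiter
		done <- 42
	}()
	fmt.Println(<-done) // consumer blocks here until the value arrives; prints 42
}
```
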
45 | ### Network channels
46 |
47 | - it'd be great to have the equivalent for a network channel
48 | - if you take local abstractions (like channels) and use them in a new
49 | context like a network, ignoring failure modes (etc), then you're gonna
50 | run into trouble
51 |
52 | Scale of engineering efforts
53 | ----------------------------
54 |
55 | In 2011, Google had:
56 |
57 | - 5000+ developers
58 | - 20+ changes per minute
59 | - 50% code base changes every month (files? not lines probably)
60 | - 50 million test cases executed per day
61 | - single code tree projects
62 |
63 | A new language was needed to fix the problems that other languages had with software engineering at this scale
64 |
65 | The scale of compilation matters.
66 | - When you compile a package A that depends on B, most (all?) languages need to compile B first
67 | - Go doesn't.
68 | - Dependencies like these at the scale of Google projects slow down compilation if you use a traditional language
69 | + gets worse with "deeper" dependencies `A->B->C->D->...`
70 | - _Example:_ at some point they found a postscript interpreter compiled in a server binary for no reason due to weird deps
71 |
72 | ### Interfaces vs. inheritance
73 |
74 | - inheritance hierarchies are hard to get right, and if you don't get them right they are hard to change later
75 | - interfaces are much more informal and clearer about who owns and supplies what parts of the program
76 |
77 | ### Readability and simplicity
78 |
79 | - Dick Gabriel quote:
80 | > "I'm always delighted by the light touch and stillness of early programming languages. Not much text; a lot gets done. Old programs read like quiet conversations between a well-spoken research worker and a well-studied mechanical colleague, not as a debate with a compiler. Who'd have guessed sophistication bought such noise?"
81 | - Simplify syntax
82 | - Avoid cleverness: ternary operators, macros
83 | - Don't let code writing be like "arguing with your compiler"
84 | - Don't want to puzzle through code 6 months later
85 |
86 | Design criteria
87 | ---------------
88 |
89 | - started by Rob Pike, Robert Griesemer and Ken Thompson in late 2007
90 | - Russ Cox, Ian Lance Taylor joined in mid-2008
91 | - design by consensus (everyone could veto a feature, if they didn't want it)
92 |
93 | ### Generics
94 |
95 | - Russ: "Don't use `*list.List`, you almost never need them. Use slices."
96 | + Generics are not bad, just hard to do right.
97 | - Early designers for Java generics also agreed and warned Go designers to be careful
98 | + Seems like they regretted getting into that business
99 |
100 | ### Engineering tools
101 |
102 | - when you have millions of lines of code, you need mechanical help
103 | + like changing an API
104 | - Go designed to be easy to parse (not like C++)
105 | - standard formatter
106 | - Means you can't tell a mechanical change from a manual change
107 | + enables automated rewrites of code
108 |
109 | ### More automation
110 |
111 | - fix code for API updates
112 | + early Go versions API changed a lot
113 | + Google had a rewriter that would fix code that used the changed APIs
114 | - renaming struct fields, variables w/ conflict resolution
115 | - moving packages
116 | - splitting of packages
117 | - code cleanup
118 | - change C code to Go
119 | - global analysis that figures out, for instance, all the implementors of an interface
120 |
121 | State of Go
122 | -----------
123 |
124 | - Go 1.4 released in December 2014
125 | - Go 1.5 has toolchain implemented in Go, not in C
126 | + concurrent GC
127 | + Go for mobile devices
128 | + Go on PowerPC, ARM64
129 | - Lots of people use it
130 | - Go conferences outside of Google/Go
131 |
132 | Q&A
133 | ---
134 |
135 | - Go vs C/C++
136 | + Go is garbage collected, biggest difference, so slower
137 | + Go can be faster than Java sometimes
138 | + once you're aware of that, you can write code that
139 | runs faster than C/C++ code
140 | + no reason that code that doesn't allocate memory
141 | shouldn't run as fast as C/C++
142 | - Goal to use Go outside Google?
143 | + Yes! Otherwise the language would die?
144 | + You get a breadth of experts that give you advice and write tools, etc.
145 | - C++ memory model guy gave feedback on Go memory model
146 | + Very useful
147 | + Not trying to replace anything like language X
148 | - but they were using C/C++ and didn't want to anymore
149 | - however Python and Ruby users are switching to Go more
150 | + Go feels just as light but statically type checked
151 | - Studies about benefits of Go?
152 | + not a lot of data collected
153 |
--------------------------------------------------------------------------------
/l19-hubspot.md:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 19: HubSpot
2 | ==============================
3 |
4 | **Note:** These lecture notes were slightly modified from the ones posted on the
5 | 6.824 [course website](http://nil.csail.mit.edu/6.824/2015/schedule.html) from
6 | Spring 2015.
7 |
8 | Distributed systems in the real world
9 | -------------------------------------
10 |
11 | Who builds distributed systems:
12 |
13 | + SaaS market
14 | - Startups: CustomMade, Instagram, HubSpot
15 | - Mature: Akamai, Facebook, Twitter
16 | + Enterprise market
17 | - Startup: Basho (Riak), Infinio, Hadapt
18 | - Mature: VMWare, Vertica
19 | + ...and graduate students
20 |
21 | High-level components:
22 |
23 | - front-end: load balancing routers
24 | - handlers, caching, storage, business services
25 | - infra-services: logging, updates, authentication
26 |
27 | Low-level components:
28 |
29 | - RPCs (semantics, failure)
30 | - coordination (consensus, Paxos)
31 | - persistence (serialization semantics)
32 | - caching
33 | - abstractions (queues, jobs, workflows)
34 |
35 | Building the thing
36 | ------------------
37 |
38 | Business needs will affect scale and architecture
39 |
40 | - dating website core data: OkCupid uses 2 beefy database servers
41 | - analytics distributed DB: Vertica/Netezza clusters have around 100 nodes
42 | - mid-size SaaS company: HubSpot uses around 100 single-node DBs or around
43 | 10 node HBase clusters
44 | + MySQL mostly
45 | - Akamai, Facebook, Amazon: tens of thousands of machines
46 |
47 | Small SaaS startup:
48 |
49 | - early on the best thing is to figure out if you have a good idea that people
50 | would buy
51 | - typically use a platform like Heroku, Google App Engine, AWS, Joyent, CloudFoundry
52 |
53 | Midsized SaaS:
54 |
55 | - need more control than what PaaS offers
56 | - scale may enable you to build better solutions more cheaply
57 | - open source solutions can help you
58 |
59 | Mature SaaS:
60 |
61 | - [Jepsen tool](http://aphyr.com/tags/jepsen)
62 | - "Ensure your design works if scale changes by 10x or 20x; the right solution
63 | for x often not optimal for 100x", Jeff Dean
64 |
65 | How to think about your design:
66 |
67 | - understand what your system needs to do and the semantics
68 | - understand workload scale then estimate (L2 access time, network latency) and
69 | plan to understand performance
70 |
71 | Running the thing
72 | -----------------
73 |
74 | - "telemetry beats event logging"
75 | + logs can be hard to understand: getting a good story out is difficult
76 | - logging: first line of defense, doesn't scale well
77 | + logs on different machines
78 | + what if timestamps are useless because clocks are not synced
79 | + lots of tools around logging
80 | + having log data in queryable format tends to be very useful
81 | - monitoring, telemetry, alerting
82 | + annotate code with timing and counting events (see the sketch after this list)
83 | + measure how big a memory queue is or how long a request takes and
84 | you can count it
85 | + can do telemetry at multiple granularities so we can break long requests
86 | into smaller pieces and pinpoint problems
87 |
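A minimal sketch of that kind of annotation in Go, using the standard `expvar` package (handler and metric names are made up for illustration, not HubSpot's actual tooling); a telemetry system can then scrape the exported counters periodically and break long requests into smaller timed pieces:

```go
package main

import (
    "expvar"
    "log"
    "net/http"
    "time"
)

// Counters exported as JSON at /debug/vars (expvar registers that endpoint
// automatically); a gauge for, say, an in-memory queue's length would be
// exported the same way.
var (
    reqCount  = expvar.NewInt("requests_total")
    reqMillis = expvar.NewInt("request_millis_total")
)

// instrument wraps a handler so every request is counted and timed.
func instrument(h http.HandlerFunc) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        h(w, r)
        reqCount.Add(1)
        reqMillis.Add(int64(time.Since(start) / time.Millisecond))
    }
}

func main() {
    http.HandleFunc("/hello", instrument(func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte("hello\n"))
    }))
    log.Fatal(http.ListenAndServe(":8080", nil))
}
```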
88 | Management: command and control
89 | -------------------------------
90 |
91 | - in classroom settings you don't have to set up a bunch of machines
92 | - as your business scales new machines need to be set up => must automate
93 | - separate configuration from app
94 | - HubSpot uses a ZooKeeper like system that allows apps to get config values
95 | - Maven for dependencies in Java
96 | - Jenkins for continuous integration testing
97 |
98 | Testing
99 | -------
100 |
101 | - automated testing makes it easy to verify newly introduced changes to your code
102 | - UI testing can be a little harder (simulate clicks, different layout in different browsers)
103 | + front end changes => must change tests?
104 |
105 | Teams
106 | -----
107 |
108 | - people: how do you get together and build the thing
109 | - analogy: software engineering process is sort of like a distributed system
110 | with unreliable components.
111 | + somehow must build reliable software on a reliable schedule
112 | - gotta take care of your people: culture has to be amenable to people growing,
113 | learning and failing
114 |
115 | Process
116 | -------
117 |
118 | - waterfall: big design upfront and then implement it
119 | - agile/scrum: don't know the whole solution, need to iterate on designs
120 | - kanban:
121 | - lean:
122 |
123 | Questions
124 | ---------
125 |
126 | - making a big change on a fast-changing code base
127 | + if you branch and then merge your changes, chances are the codebase has
128 | changed drastically
129 | + you can try to have two different branches deployed such that the new
130 | branch can be tested in production
131 | - culture changes with growth
132 | + need to pay attention to culture and happiness of employees
133 | + very important to measure happiness
134 | + having small teams might help because people can own projects
135 |
--------------------------------------------------------------------------------
/l23-bitcoin.html:
--------------------------------------------------------------------------------
(HTML rendering of l23-bitcoin.md; same notes as the Markdown source below.)
--------------------------------------------------------------------------------
/l23-bitcoin.md:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 23: Bitcoin
2 | ==============================
3 |
4 | **Note:** These lecture notes were slightly modified from the ones posted on the
5 | 6.824 [course website](http://nil.csail.mit.edu/6.824/2015/schedule.html) from
6 | Spring 2015.
7 |
8 | Bitcoin
9 | -------
10 |
11 | - an electronic currency system
12 | - has a technical side and a financial, economic, social side
13 | - maybe the 1st thing to ask: is it trying to do something better? is there a
14 | problem it solves for us?
15 | - online payments use credit cards, why not just use them?
16 | + Pluses:
17 | + They work online
18 | + Hard for people to steal my credit card (there are laws about how credit
19 | card companies work so that if your number is stolen, you are protected)
20 | + Good/Bad:
21 | - Customer service # on the back allows you to reverse charges
22 | + this can prevent or create fraud
23 | - tied to some country's currency
24 | - Minuses
25 | - No way for me as a customer or a merchant to independently verify anything
26 | about a credit card transaction: do you have money, is the CC # valid?
27 | + it can be good if you don't want people finding out how much money
28 | you have
29 | - relies on 3rd parties: great way to charge fees on everything
30 | - 3% fees
31 | - settling time is quite long (merchants are not sure they are getting their
32 | money until after one month)
33 | - pretty hard to become a credit card merchant
34 | - credit card companies take a lot of risk by sending money to merchants who
35 | might not send products to customers, resulting in the credit card
36 | company having to refund customers
37 | - For Bitcoin:
38 | + no 3rd parties are needed (well, not really true anymore)
39 | + fees are much smaller than 3%
40 | + the settling time is maybe 10 minutes
41 | + anyone can become a merchant
42 | - Bitcoin makes the sequence of transactions verifiable by, and agreed on by,
43 | everyone `=>` no need to rely on 3rd parties
44 |
45 |
46 | OneBit
47 | ------
48 |
49 | - simple electronic money system
50 | - it has one server called OneBank
51 | - each user owns some coins
52 |
53 | Design:
54 |
55 | OneBank server
56 |
57 | - a OneBit transaction (xction) contains (see the sketch after this list):
58 | 1. public key of new owner
59 | 2. a hash of the last transfer record of this coin
60 | 3. a signature done over this record by the private key of last owner
61 | - bank keeps the list of transactions for each coin
62 | - `x` transfers the coin to `y`
63 | - `[T7: from=x, to=y; hash=h(prev tx); sig_x(this)]`
64 | - `y` transfers the coin to `z`, gets a hamburger from McDonalds
65 | - `[T8: from y, to=z; hash=h(T7); sig_y(this)]`
66 | - what can go wrong?
67 | + if someone transfers a coin to `z` it seems very unlikely that anyone else
68 | other than `z` can spend that coin: because no one else can sign a new
69 | transaction with that coin since they don't have `z`'s private key
70 | - we have to trust one bank to not let users double spend money
71 | + `y` can also buy a milkshake from Burger King with that same coin if the bank
72 | helps him
73 | + `[T8': from y, to=q'; hash=h(T7); sig_y(this)]`
74 | + the bank can show T8 to McDonalds and T8' to Burger King
75 | + (I love free food!)
76 | + as long as McDonalds and Burger King don't talk to each other and verify
77 | the transaction chain, they won't detect it
78 |
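A minimal sketch of the transfer-record idea in the list above (the struct fields and the use of ECDSA/SHA-256 are my assumptions, not something the lecture specified):

```go
package main

import (
    "crypto/ecdsa"
    "crypto/elliptic"
    "crypto/rand"
    "crypto/sha256"
    "fmt"
    "math/big"
)

// Transfer is one link in a coin's history: it hands the coin to NewOwner,
// points (by hash) at the previous transfer record, and is signed by the
// previous owner.
type Transfer struct {
    NewOwner *ecdsa.PublicKey // public key of the new owner
    PrevHash [32]byte         // hash of the last transfer record of this coin
    R, S     *big.Int         // previous owner's signature over (NewOwner, PrevHash)
}

func (t *Transfer) digest() [32]byte {
    msg := append(elliptic.Marshal(elliptic.P256(), t.NewOwner.X, t.NewOwner.Y), t.PrevHash[:]...)
    return sha256.Sum256(msg)
}

// verify checks that prevOwner really signed this transfer and that it points
// at the previous record; this is the check McDonalds and Burger King would
// both have to do (and compare notes on) to catch the double spend.
func verify(t *Transfer, prevOwner *ecdsa.PublicKey, prevRecordHash [32]byte) bool {
    if t.PrevHash != prevRecordHash {
        return false
    }
    d := t.digest()
    return ecdsa.Verify(prevOwner, d[:], t.R, t.S)
}

func main() {
    alice, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
    bob, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)

    var prev [32]byte // pretend this is the hash of the coin's last record
    t := &Transfer{NewOwner: &bob.PublicKey, PrevHash: prev}
    d := t.digest()
    t.R, t.S, _ = ecdsa.Sign(rand.Reader, alice, d[:])

    fmt.Println("signed by last owner:", verify(t, &alice.PublicKey, prev)) // true
    fmt.Println("forged signer:       ", verify(t, &bob.PublicKey, prev))   // false
}
```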
79 | Bitcoin block chain
80 | -------------------
81 |
82 | - bitcoin has a single block chain
83 | - many servers: more or less replicas, each with a copy of the entire block chain
84 | - each block in the block chain looks like this:
85 | + hash of previous block
86 | + set of transactions
87 | + nonce
88 | + current time
89 | - a transaction (xaction) has two stages
90 | + first it is created and sent out to the network
91 | + then the transaction is incorporated into the block chain
92 |
93 | ### How are blocks created? Mining
94 |
95 | All of the peers in the bitcoin network try to create the next block:
96 |
97 | - each peer takes all transactions that have arrived since the previous block
98 | was created and tries to append a new block with them
99 | - the rules say that the hash of a block has to be less than a certain number
100 | (i.e. it has a # of leading zeros, making it hard to find)
101 | - each of the bitcoin peers adjusts the `nonce` field in the block until it
102 | gets a hash with a certain # of leading zeros (see the sketch after this list)
103 | - the point of this is to make it expensive to create new blocks
104 | + for a single computer it might take months to find such a nonce
105 | - the # of leading zeros is adjusted so that on average it takes 10 minutes for
106 | a new block to be added
107 | + clients monitor the `currentTime` field in the last 5 blocks or so
108 | and if they took too little time, they add another zero to the # of target zeros
109 | - everyone obeys the protocol because if they don't the others will either
110 | reject their block (say if it has the wrong # of zeros or a wrong timestamp)
111 |
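A toy sketch of the mining loop (the structure is assumed; real Bitcoin double-hashes a binary block header and uses a finer-grained target than whole zero bytes):

```go
package main

import (
    "crypto/sha256"
    "encoding/binary"
    "fmt"
    "time"
)

// Block is the simplified block from the list above: previous hash, set of
// transactions, nonce, current time.
type Block struct {
    PrevHash [32]byte
    Txs      []string
    Nonce    uint64
    Time     int64
}

func (b *Block) hash() [32]byte {
    h := sha256.New()
    h.Write(b.PrevHash[:])
    for _, tx := range b.Txs {
        h.Write([]byte(tx))
    }
    var buf [16]byte
    binary.LittleEndian.PutUint64(buf[:8], b.Nonce)
    binary.LittleEndian.PutUint64(buf[8:], uint64(b.Time))
    h.Write(buf[:])
    var out [32]byte
    copy(out[:], h.Sum(nil))
    return out
}

// mine adjusts the nonce until the hash starts with `zeros` zero bytes;
// this is what makes creating a new block expensive.
func mine(b *Block, zeros int) [32]byte {
    for {
        h := b.hash()
        ok := true
        for i := 0; i < zeros; i++ {
            if h[i] != 0 {
                ok = false
                break
            }
        }
        if ok {
            return h
        }
        b.Nonce++
    }
}

func main() {
    b := &Block{Txs: []string{"T8: from y, to=z"}, Time: time.Now().Unix()}
    h := mine(b, 2) // 2 zero bytes: ~65536 attempts on average
    fmt.Printf("nonce=%d hash=%x\n", b.Nonce, h)
}
```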
112 | ### The empty block chain
113 |
114 | - "In the beginning there was nothing, and then Satoshi created the first block."
115 | - "And then people started mining additional blocks, with no transactions."
116 | - "And then they got mining reward for each mined block."
117 | - "And that's how users got Bitcoins."
118 | - "And then they started doing transactions."
119 | - "And then there was light."
120 |
121 | ### What does it take to double spend
122 |
123 | If a tx is in the block chain, can the system double spend its coins?
124 |
125 | - forking the block chain is the only way to do this
126 | - can the forks be hidden for long?
127 | - if a fork happens, miners will pick either side and continue mining
128 | - when a fork gets longer, everyone switches to it
129 | + if they stay on the shorter fork, they are likely to be outmined by the others
130 | and waste work, so they will have incentive to go on the longer one
131 | + the tx's on the shorter fork get incorporated in the longer one
132 | + committed tx's can get undone => people usually wait for a few extra blocks
133 | to be created after a tx's block
134 | - this is where the 51% rule comes in: if 51% of the computing power is honest
135 | the protocol works correctly
136 | - if more than 51% are dishonest, then they'll likely succeed in mining anything
137 | they want
138 | - probably the most clever thing about bitcoin: as long as you believe that more
139 | than half the computing power is not cheating, you can be sure there's no double
140 | spending
141 |
142 | ### Good and bad parts of design
143 |
144 | - (+) publicly verifiable log
145 | - (-) tied to a new currency and it is very volatile
146 | + lots of people don't use it for this reason
147 | - (+/-) mining-decentralized trust
148 |
149 | Hard to say what will happen:
150 |
151 | - we could be all using it in 30 years
152 | - or, banks could catch up, and come up with their own verifiable log design
153 |
--------------------------------------------------------------------------------
/lab2/lab-2a-vs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/lab2/lab-2a-vs.png
--------------------------------------------------------------------------------
/lab4/notes.html:
--------------------------------------------------------------------------------
(HTML rendering of lab4/notes.md; same notes as the Markdown source below.)
--------------------------------------------------------------------------------
/lab4/notes.md:
--------------------------------------------------------------------------------
1 | Lab 4: Part A
2 | =============
3 |
4 | Details
5 | -------
6 |
7 | Partition/shard keys over a set of replica groups. Each replica group handles
8 | puts and gets for a # of shards. Groups operate in parallel => higher system
9 | throughput.
10 |
11 | Components:
12 |
13 | - a set of replica groups
14 | + each replica group is responsible for a subset of the shards
15 | - a shardmaster
16 | + decides which replica group should serve each shard
17 | + configuration changes over time
18 | + clients contact shard master, find replica group
19 | + replica groups consult the master, to find out what shards to serve
20 | + single-service, replicated using Paxos
21 |
22 | All replica group members must agree on whether Get/Put happened before/after a reconfiguration `=>` store Put/Get/Append + reconfigurations in Paxos log
23 |
24 | Reasonable to assume each replica group is always available (because of Paxos replication) `=>` simpler than primary/backup replication when primary goes down and still thinks it's primary
25 |
26 | Shardmaster
27 | -----------
28 |
29 | - manages a _sequence of numbered configurations_
30 | + `config = set of {replica group}, assignment of shards to {replica group}`
31 | - RPC interface
32 | + `Join(gid, servers)`
33 | - takes a replica group ID and an array of servers for that group
34 | - adds the new replica group
35 | - rebalances the shards across all replicas
36 | - returns a new configuration that includes the new replica group
37 | + `Leave(gid)`
38 | - takes a replica group ID
39 | - removes that replica group
40 | - rebalances the shards across all remaining replicas
41 | + `Move(shardno, gid)`
42 | - takes a shard # and a replica group ID
43 | - reassigns the shard from its current replica group to the specified
44 | replica group
45 | - subsequent `Join`'s or `Leave`'s can undo the work done by `Move`
46 | because they rebalance
47 | + `Query(configno)`
48 | - returns the configuration with that number
49 | - if `configno == -1` or `configno` is bigger than the biggest known
50 | config number, then return the latest configuration
51 | - `Query(-1)` should reflect every Join, Leave or Move that completed before
52 | the `Query(-1)` RPC was sent
53 | - rebalancing should divide the shards as evenly as possible among the
54 | groups and move as few shards (not data?) as possible in the process (see the sketch after this list)
55 | + `=>` only move shard from one group to another "wisely"
56 | - **No need for duplicate detection**, in practice you would need to!
57 | - the first configuration has #0, contains _no groups_, all shards assigned
58 | to GID 0 (an invalid GID)
59 | - typically many more shards than groups
60 |
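A minimal sketch of the config type and rebalancing, assuming a `Config` shaped like the lab skeleton's (`Num`, `Shards`, `Groups`); the greedy rebalance is just an illustration of "even out, move few shards", not the official solution:

```go
package shardmaster

import "sort"

const NShards = 10

type Config struct {
    Num    int                // config number
    Shards [NShards]int64     // shard -> gid
    Groups map[int64][]string // gid -> servers
}

// rebalance moves shards one at a time, either from orphaned assignments
// (e.g. GID 0 or a departed group) or from the most loaded group, to the
// least loaded group, until counts differ by at most one. Moving greedily
// like this also keeps the number of moved shards small.
func rebalance(c *Config) {
    if len(c.Groups) == 0 {
        return
    }
    // Deterministic order matters: every replica applying the same Join/Leave
    // from the Paxos log must compute the same assignment.
    gids := make([]int64, 0, len(c.Groups))
    for gid := range c.Groups {
        gids = append(gids, gid)
    }
    sort.Slice(gids, func(i, j int) bool { return gids[i] < gids[j] })

    count := make(map[int64]int)
    for _, gid := range gids {
        count[gid] = 0
    }
    for _, gid := range c.Shards {
        if _, ok := count[gid]; ok {
            count[gid]++
        }
    }

    for {
        min, max := gids[0], gids[0]
        for _, gid := range gids {
            if count[gid] < count[min] {
                min = gid
            }
            if count[gid] > count[max] {
                max = gid
            }
        }
        moved := false
        for shard, gid := range c.Shards {
            if _, valid := count[gid]; !valid { // orphaned shard: just take it
                c.Shards[shard] = min
                count[min]++
                moved = true
                break
            }
        }
        if moved {
            continue
        }
        if count[max]-count[min] <= 1 {
            return // balanced
        }
        for shard, gid := range c.Shards {
            if gid == max { // shift one shard from the most to the least loaded
                c.Shards[shard] = min
                count[max]--
                count[min]++
                break
            }
        }
    }
}
```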
61 | Hints
62 | -----
63 |
64 |
65 | Lab 4: Part B
66 | =============
67 |
68 | Notes
69 | -----
70 |
71 | We supply you with client.go code that sends each RPC to the replica group responsible for the RPC's key. It re-tries if the replica group says it is not responsible for the key; in that case, the client code asks the shard master for the latest configuration and tries again. You'll have to modify client.go as part of your support for dealing with duplicate client RPCs, much as in the kvpaxos lab.
72 |
73 | **TODO:** Xid's across different replica groups? How do those work? We can execute
74 | an op on one replica group and be told "wrong" replica, when we take that op
75 | to another group we don't ever want to be told "duplicate op", just because
76 | we talked to another replica.
77 |
78 | Plan
79 | ----
80 |
81 | A client's transaction ID (xid) should be `<clerkID, shardNo, seqNo>`, where `seqNo`
82 | autoincrements, so that when we transfer shards from one group to another, the xids
83 | of the ops for the transferred shards will not conflict with existing xids on
84 | the other group.
85 |
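A tiny sketch of that ID scheme and the duplicate check (type and field names are mine, not the lab skeleton's):

```go
package shardkv

// Xid identifies a client operation. Keying the sequence number by
// (clerkID, shard) means xids that travel with a migrated shard cannot
// collide with xids the receiving group already knows about.
type Xid struct {
    ClerkID int64
    Shard   int
    Seq     int64 // auto-incremented by the clerk, per shard
}

type dupKey struct {
    ClerkID int64
    Shard   int
}

// dupTable maps (clerk, shard) to the highest Seq already applied;
// an op is a duplicate iff its Seq is not newer.
type dupTable map[dupKey]int64

func (t dupTable) isDup(x Xid) bool {
    return x.Seq <= t[dupKey{x.ClerkID, x.Shard}]
}

func (t dupTable) record(x Xid) {
    if x.Seq > t[dupKey{x.ClerkID, x.Shard}] {
        t[dupKey{x.ClerkID, x.Shard}] = x.Seq
    }
}
```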
86 | When configuration doesn't change, things stay simple, even when servers go down:
87 |
88 | - clients find out which GID to contact
89 | - clients send Op to GID
90 | - GID agrees on Op using Paxos
91 | + if a GID server goes down, that's fine, we have paxos
92 |
93 | Hints
94 | -----
95 |
96 | **Hint:** your server will need to periodically check with the shardmaster to
97 | see if there's a new configuration; do this in `tick()`.
98 |
99 | **TODO:** If there was a configuration change, could we have picked it up too late?
100 | What if we serviced requests?
101 |
102 | **Hint:** you should have a function whose job it is to examine recent entries
103 | in the Paxos log and apply them to the state of the shardkv server. Don't
104 | directly update the stored key/value database in the Put/Append/Get handlers;
105 | instead, attempt to append a Put, Append, or Get operation to the Paxos log, and
106 | then call your log-reading function to find out what happened (e.g., perhaps a
107 | reconfiguration was entered in the log just before the Put/Append/Get).
108 |
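A rough shape for that log-reading function, continuing the sketches above (same assumed package; `decided()` is a stand-in for however your code asks its Paxos peer for the value chosen at a slot):

```go
package shardkv

import "shardmaster" // for the Config type, as in the lab's GOPATH layout

// Op is the unit agreed on in the Paxos log: a client request or a
// reconfiguration (field names are illustrative).
type Op struct {
    Kind   string // "Get", "Put", "Append", or "Reconfig"
    Key    string
    Value  string
    Xid    Xid
    Config shardmaster.Config
}

// ShardKV here is a stub with just enough state for the sketch.
type ShardKV struct {
    gid       int64             // this server's replica group
    nextApply int               // first Paxos slot not yet applied
    data      map[string]string // the key/value store
    dup       dupTable          // duplicate detection, sketched earlier
    config    shardmaster.Config
}

// decided reports whether slot seq has a chosen value; it must not block.
func (kv *ShardKV) decided(seq int) (Op, bool) { return Op{}, false }

func (kv *ShardKV) ownsShard(shard int) bool { return kv.config.Shards[shard] == kv.gid }

// applyLog applies decided entries in order and stops at the first undecided
// slot. Get/Put/Append handlers append their own Op and then call this to
// find out what actually happened (maybe a reconfig was logged just before).
func (kv *ShardKV) applyLog() {
    for {
        op, ok := kv.decided(kv.nextApply)
        if !ok {
            return
        }
        switch op.Kind {
        case "Reconfig":
            kv.config = op.Config // one re-configuration at a time, in order
            // (real code also pulls in the migrated shards' data here)
        case "Put", "Append":
            if kv.ownsShard(op.Xid.Shard) && !kv.dup.isDup(op.Xid) {
                if op.Kind == "Put" {
                    kv.data[op.Key] = op.Value
                } else {
                    kv.data[op.Key] += op.Value
                }
                kv.dup.record(op.Xid)
            }
        }
        kv.nextApply++
    }
}
```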
109 | **TODO:** Right now I only applyLog when I receive a Get. Gotta be sure I can
110 | reconfigure in the middle of a `Get` request.
111 |
112 | **Hint:** your server should respond with an `ErrWrongGroup` error to a client RPC
113 | with a key that the server isn't responsible for (i.e. for a key whose shard is
114 | not assigned to the server's group). Make sure your Get/Put/Append handlers make
115 | this decision correctly in the face of a concurrent re-configuration.
116 |
117 | - seems like you can only check what shard you are responsible for at log apply
118 | time
119 | - `=>` ops must wait
120 |
121 | **Hint:** process re-configurations one at a time, in order.
122 |
123 | **Hint:** during re-configuration, replica groups will have to send each other
124 | the keys and values for some shards.
125 |
126 | **TODO:** What if servers go down during this? Can I still agree on ops during this? Seems like it.
127 |
128 | - maybe we can share the keys and values via the log?
129 |
130 | **Hint:** When the test fails, check for gob error (e.g. "rpc: writing response:
131 | gob: type not registered for interface ...") in the log because go doesn't
132 | consider the error fatal, although it is fatal for the lab.
133 |
134 | **Hint:** Be careful about implementing at-most-once semantic for RPC. When a
135 | server sends shards to another, the server needs to send the clients state as
136 | well. Think about how the receiver of the shards should update its own clients
137 | state. Is it ok for the receiver to replace its clients state with the received
138 | one?
139 |
140 | **TODO:** What is this client state?? Is it the XIDs associated with the log ops?
141 | I think they mean the lastXid[clerkID] map. Servers in G1 could have lastXid[c, shard] = i
142 | and servers in G2 could have lastXid[c, shard] = j.
143 |
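One common way to answer that, building on the `dupTable` sketch above (an assumption on my part, not the official answer): merge the received state into the local table by keeping the larger Seq per (clerk, shard), rather than replacing the local state wholesale.

```go
// mergeDup folds duplicate-detection state received along with migrated
// shards into the local table, keeping whichever side has seen more of each
// (clerk, shard).
func (t dupTable) mergeDup(received dupTable) {
    for k, seq := range received {
        if seq > t[k] {
            t[k] = seq
        }
    }
}
```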
144 | **Hint:** Think about how should the shardkv client and server deal with
145 | ErrWrongGroup. Should the client change the sequence number if it receives
146 | ErrWrongGroup? Should the server update the client state if it returns
147 | ErrWrongGroup when executing a Get/Put request?
148 |
149 | **TODO:** This gets to my question from the "Notes" section...
150 |
151 | **Hint:** After a server has moved to a new view, it can leave the shards that
152 | it is not owning in the new view undeleted. This will simplify the server
153 | implementation.
154 |
155 | **Hint:** Think about when it is ok for a server to give shards to the other
156 | server during view change.
157 |
158 | **TODO:** Before applying the new configuration change?
159 |
160 | Algorithm
161 | ---------
162 |
--------------------------------------------------------------------------------
/lab5/index.html:
--------------------------------------------------------------------------------
6.824 Lab 5: Persistence

In this lab you'll add persistence to your key/value server. The overall goal
is to be able to recover after the crash and restart of one or more key/value
servers. It's this capability that makes fault-tolerance really valuable! The
specific properties you'll need to ensure are:

- If a key/value server crashes (halts cleanly with disk intact), and is
  re-started, it should re-join its replica group. The effect on availability
  of one or more such crashed key/value servers should be no worse than if the
  same servers had been temporarily disconnected from the network rather than
  crashing. This ability to re-start requires that each replica save its
  key/value database, Paxos state, and any other needed state to disk where it
  can find it after the re-start.

- If a key/value server crashes (halts cleanly) and loses its disk contents,
  and is re-started, it should acquire a key/value database and other needed
  state from the other replicas and re-join its replica group. If a majority of
  a replica group simultaneously loses disk contents, the group cannot ever
  continue. If a minority simultaneously lose their disk content, and re-start,
  the group must recover so that it can tolerate future crashes.

You do not need to design a high-performance format for the on-disk data. It
is sufficient for a server to store each key/value pair in a separate file, and
to use a few more files to store its other state.

You do not need to add persistence to the shardmaster. The tester uses your
existing shardmaster package.

This lab requires more thought than you might think.

You may find Paxos Made Live useful, particularly Section 5.1. Harp may also
be worth reviewing, since it pays special attention to recovery from various
crash and disk-loss failures.

You should either do Lab 5, or a project, but not both.

Collaboration Policy

You must write all the code you hand in for 6.824, except for code that we
give you as part of the assignment. You are not allowed to look at anyone
else's solution, and you are not allowed to look at code from previous years.
You may discuss the assignments with other students, but you may not look at
or copy each others' code. Please do not publish your code or make it
available to future 6.824 students -- for example, please do not make your
code visible on github.

Software

Do a git pull to get the latest lab software. We supply you with new skeleton
code and new tests in src/diskv.

First merge a copy of your Lab 4 code into diskv/server.go, common.go, and
client.go. Be careful when merging StartServer(), since it's a bit different
from Lab 4. And don't copy test_test.go; Lab 5 has a new set of tests.

There are a few differences between the Lab 4 and Lab 5 frameworks. First,
StartServer() takes an extra dir argument that tells a key/value server the
directory in which it should store its state (key/value pairs, Paxos state,
etc.). A server should only use files under that directory; it should not use
any other files. The tests give a server the same directory name each time the
tests re-start a given server. StartServer() can tell if it has been
re-started (as opposed to started for the first time) by looking at its
restart argument. The tests give each server a different directory.

The second big framework difference is that the Lab 5 tests run each key/value
server as a separate UNIX process, rather than as a set of threads in a single
process. main/diskvd.go is the main() routine for the key/value server
process. The tester runs diskvd.go as a separate program, and diskvd.go calls
StartServer().

After merging your Lab 4 code into diskv, you should be able to pass the tests
that print (lab4). These are copies of Lab 4 tests.
Hints

If a server crashes, loses its disk, and re-starts, a potential problem is
that it could participate in Paxos instances that it had participated in
before crashing. Since the server has lost its Paxos state, it won't
participate correctly in these instances. So you must find a way to ensure
that servers that re-join after disk loss only participate in new instances.

diskv/server.go includes some functions that may be helpful to you when
reading and writing files containing key/value data.

You may want to use Go's gob package to format and parse saved state. Here's
an example. As with RPC, if you want to encode structs with gob, you must
capitalize the field names.
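A minimal sketch of the gob idea, assuming a made-up PersistentState type and file path (the real server has more state than this):

    package diskv

    import (
        "encoding/gob"
        "os"
    )

    // PersistentState is whatever the server must find again after a re-start.
    // Fields must be exported (capitalized) or gob will not encode them.
    type PersistentState struct {
        Database map[string]string
        NextSeq  int // e.g. the next Paxos slot to apply
    }

    func save(path string, st *PersistentState) error {
        f, err := os.Create(path) // see the next hint for making this atomic
        if err != nil {
            return err
        }
        defer f.Close()
        return gob.NewEncoder(f).Encode(st)
    }

    func restore(path string) (*PersistentState, error) {
        f, err := os.Open(path)
        if err != nil {
            return nil, err
        }
        defer f.Close()
        st := &PersistentState{}
        err = gob.NewDecoder(f).Decode(st)
        return st, err
    }
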
The Lab 5 tester will kill key/value servers so that they stop executing at a
random place, which was not the case in previous labs. One consequence is
that, if your server is writing a file, the tester might kill it midway
through writing the file (much as a real crash might occur while writing a
file). A good way to cause replacement of a whole file to nevertheless be
atomic is to write to a temporary file in the same directory, and then call
os.Rename(tempname, realname).
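A sketch of that write-temp-then-rename pattern (the helper name is made up):

    package diskv

    import (
        "io/ioutil"
        "os"
        "path/filepath"
    )

    // atomicWrite replaces dir/name with data. The temp file must live in the
    // same directory so the rename cannot cross filesystems; a crash leaves
    // either the old contents or the new ones, never a half-written file.
    func atomicWrite(dir, name string, data []byte) error {
        tmp, err := ioutil.TempFile(dir, name+".tmp")
        if err != nil {
            return err
        }
        if _, err := tmp.Write(data); err != nil {
            tmp.Close()
            os.Remove(tmp.Name())
            return err
        }
        if err := tmp.Close(); err != nil {
            os.Remove(tmp.Name())
            return err
        }
        return os.Rename(tmp.Name(), filepath.Join(dir, name))
    }
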
You'll probably have to modify your Paxos implementation, at least so that
it's possible to save and restore a key/value server's Paxos state.

Don't run multiple instances of the Lab 5 tests at the same time on the same
machine. They will remove each others' files.

Handin procedure

Submit your code via the class's submission website, located here:

You may use your MIT Certificate or request an API key via email to log in for
the first time. Your API key (XXX) is displayed once you have logged in, and
can be used to upload your lab from the console as follows.

    $ cd ~/6.824
    $ echo XXX > api.key
    $ make lab5

You can check the submission website to check if your submission is
successful.

You will receive full credit if your software passes the test_test.go tests
when we run your software on our machines. We will use the timestamp of your
last submission for the purpose of calculating late days.

Please post questions on Piazza.
--------------------------------------------------------------------------------
/original-notes/l02-rpc.txt:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 2: Infrastructure: RPC and threads
2 |
3 | Remote Procedure Call (RPC)
4 | a key piece of distrib sys machinery; all the labs use RPC
5 | goal: easy-to-program network communication
6 | hides most details of client/server communication
7 | client call is much like ordinary procedure call
8 | server handlers are much like ordinary procedures
9 | RPC is widely used!
10 |
11 | RPC ideally makes net communication look just like fn call:
12 | Client:
13 | z = fn(x, y)
14 | Server:
15 | fn(x, y) {
16 | compute
17 | return z
18 | }
19 | RPC aims for this level of transparency
20 |
21 | Examples from lab 1:
22 | DoJob
23 | Register
24 |
25 | RPC message diagram:
26 | Client Server
27 | request--->
28 | <---response
29 |
30 | Software structure
31 | client app handlers
32 | stubs dispatcher
33 | RPC lib RPC lib
34 | net ------------ net
35 |
36 | A few details:
37 | Which server function (handler) to call?
38 | Marshalling: format data into packets
39 | Tricky for arrays, pointers, objects, &c
40 | Go's RPC library is pretty powerful!
41 | some things you cannot pass: e.g., channels, functions
42 | Binding: how does client know who to talk to?
43 | Maybe client supplies server host name
44 | Maybe a name service maps service names to best server host
45 | Threads:
46 | Client often has many threads, so > 1 call outstanding, match up replies
47 | Handlers may be slow, so server often runs each in a thread
48 |
49 | RPC problem: what to do about failures?
50 | e.g. lost packet, broken network, slow server, crashed server
51 |
52 | What does a failure look like to the client RPC library?
53 | Client never sees a response from the server
54 | Client does *not* know if the server saw the request!
55 | Maybe server/net failed just before sending reply
56 | [diagram of lost reply]
57 |
58 | Simplest scheme: "at least once" behavior
59 | RPC library waits for response for a while
60 | If none arrives, re-send the request
61 | Do this a few times
62 | Still no response -- return an error to the application
63 |
64 | Q: is "at least once" easy for applications to cope with?
65 |
66 | Simple problem w/ at least once:
67 | client sends "deduct $10 from bank account"
68 |
69 | Q: what can go wrong with this client program?
70 | Put("k", 10) -- an RPC to set key's value in a DB server
71 | Put("k", 20) -- client then does a 2nd Put to same key
72 | [diagram, timeout, re-send, original arrives very late]
73 |
74 | Q: is at-least-once ever OK?
75 | yes: if it's OK to repeat operations, e.g. read-only op
76 | yes: if application has its own plan for coping w/ duplicates
77 | which you will need for Lab 1
78 |
79 | Better RPC behavior: "at most once"
80 | idea: server RPC code detects duplicate requests
81 | returns previous reply instead of re-running handler
82 | Q: how to detect a duplicate request?
83 | client includes unique ID (XID) with each request
84 | uses same XID for re-send
85 | server:
86 | if seen[xid]:
87 | r = old[xid]
88 | else
89 | r = handler()
90 | old[xid] = r
91 | seen[xid] = true
92 |
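  A Go version of the same dedup idea (a sketch, not the l-rpc.go handout;
  names are made up):

      package rpcdemo

      import "sync"

      type Server struct {
          mu   sync.Mutex
          seen map[int64]bool        // xid -> already executed?
          old  map[int64]interface{} // xid -> reply sent last time
      }

      // Handle runs fn at most once per xid; duplicates get the saved reply.
      // (Holding the lock across fn is the simplistic version; see the
      // "pending" discussion below for in-flight duplicates.)
      func (s *Server) Handle(xid int64, fn func() interface{}) interface{} {
          s.mu.Lock()
          defer s.mu.Unlock()
          if s.seen[xid] {
              return s.old[xid]
          }
          r := fn()
          s.old[xid] = r
          s.seen[xid] = true
          return r
      }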
93 | some at-most-once complexities
94 | this will come up in labs 2 and on
95 | how to ensure XID is unique?
96 | big random number?
97 | combine unique client ID (ip address?) with sequence #?
98 | server must eventually discard info about old RPCs
99 | when is discard safe?
100 | idea:
101 | unique client IDs
102 | per-client RPC sequence numbers
103 | client includes "seen all replies <= X" with every RPC
104 | much like TCP sequence #s and acks
105 | or only allow client one outstanding RPC at a time
106 | arrival of seq+1 allows server to discard all <= seq
107 | or client agrees to keep retrying for < 5 minutes
108 | server discards after 5+ minutes
109 | how to handle dup req while original is still executing?
110 | server doesn't know reply yet; don't want to run twice
111 | idea: "pending" flag per executing RPC; wait or ignore
112 |
113 | What if an at-most-once server crashes and re-starts?
114 | if at-most-once duplicate info in memory, server will forget
115 | and accept duplicate requests after re-start
116 | maybe it should write the duplicate info to disk?
117 | maybe replica server should also replicate duplicate info?
118 |
119 | What about "exactly once"?
120 | at-most-once plus unbounded retries plus fault-tolerant service
121 | Lab 3
122 |
123 | Go RPC is "at-most-once"
124 | open TCP connection
125 | write request to TCP connection
126 | TCP may retransmit, but server's TCP will filter out duplicates
127 | no retry in Go code (i.e. will NOT create 2nd TCP connection)
128 | Go RPC code returns an error if it doesn't get a reply
129 | perhaps after a timeout (from TCP)
130 | perhaps server didn't see request
131 | perhaps server processed request but server/net failed before reply came back
132 |
133 | Go RPC's at-most-once isn't enough for Lab 1
134 | it only applies to a single RPC call
135 |     if worker doesn't respond, the master re-sends it to another worker
136 | but original worker may have not failed, and is working on it too
137 | Go RPC can't detect this kind of duplicate
138 | No problem in lab 1, which handles at application level
139 | Lab 2 will explicitly detect duplicates
140 |
141 | Threads
142 | threads are a fundamental server structuring tool
143 | you'll use them a lot in the labs
144 | they can be tricky
145 | useful with RPC
146 | Go calls them goroutines; everyone else calls them threads
147 |
148 | Thread = "thread of control"
149 | threads allow one program to (logically) do many things at once
150 | the threads share memory
151 | each thread includes some per-thread state:
152 | program counter, registers, stack
153 |
154 | Threading challenges:
155 | sharing data
156 | two threads modify the same variable at same time?
157 | one thread reads data that another thread is changing?
158 | these problems are often called races
159 | need to protect invariants on shared data
160 | use Go sync.Mutex
161 | coordination between threads
162 | e.g. wait for all Map threads to finish
163 | use Go channels
164 | deadlock
165 | thread 1 is waiting for thread 2
166 | thread 2 is waiting for thread 1
167 | easy detectable (unlike races)
168 | lock granularity
169 | coarse-grained -> simple, but little concurrency/parallelism
170 | fine-grained -> more concurrency, more races and deadlocks
171 | let's look at a toy RPC package to illustrate these problems
172 |
173 | look at today's handout -- l-rpc.go
174 | it's a simplified RPC system
175 | illustrates threads, mutexes, channels
176 | it's a toy, though it does run
177 | assumes connection already open
178 | only supports an integer arg, integer reply
179 | omits error checks
180 |
181 | struct ToyClient
182 | client RPC state
183 | mutex per ToyClient
184 | connection to server (e.g. TCP socket)
185 | xid -- unique ID per call, to match reply to caller
186 | pending[] -- chan per thread waiting in Call()
187 | so client knows what to do with each arriving reply
188 |
189 | Call
190 | application calls reply := client.Call(procNum, arg)
191 | procNum indicates what function to run on server
192 | WriteRequest knows the format of an RPC msg
193 | basically just the arguments turned into bits in a packet
194 | Q: why the mutex in Call()? what does mu.Lock() do?
195 | Q: could we move "xid := tc.xid" outside the critical section?
196 | after all, we are not changing anything
197 | [diagram to illustrate]
198 | Q: do we need to WriteRequest inside the critical section?
199 | note: Go says you are responsible for preventing concurrent map ops
200 | that's one reason the update to pending is locked
201 |
202 | Listener
203 | runs as a background thread
204 | what is <- doing?
205 | not quite right that it may need to wait on chan for caller
206 |
207 | Back to Call()...
208 |
209 | Q: what if reply comes back very quickly?
210 | could Listener() see reply before pending[xid] entry exists?
211 | or before caller is waiting for channel?
212 |
213 | Q: should we put reply:=<-done inside the critical section?
214 | why is it OK outside? after all, two threads use it.
215 |
216 | Q: why mutex per ToyClient, rather than single mutex per whole RPC pkg?
217 |
218 | Server's Dispatcher()
219 | note that the Dispatcher echos the xid back to the client
220 | so that Listener knows which Call to wake up
221 | Q: why run the handler in a separate thread?
222 | Q: is it a problem that the dispatcher can reply out of order?
223 |
224 | main()
225 | note registering handler in handlers[]
226 | what will the program print?
227 |
228 | Q: when to use channels vs shared memory + locks?
229 | here is my opinion
230 | use channels when you want one thread to explicitly wait for another
231 | often wait for a result, or wait for the next request
232 | e.g. when client Call() waits for Listener()
233 | use shared memory and locks when the threads are not intentionally
234 | directly interacting, but just happen to r/w the same data
235 | e.g. when Call() uses tc.xid
236 | but: they are fundamentally equivalent; either can always be used.
237 |
238 | Go's "memory model" requires explicit synchronization to communicate!
239 | This code is not correct:
240 | var x int
241 | done := false
242 | go func() { x = f(...); done = true }
243 | while done == false { }
244 | it's very tempting to write, but the Go spec says it's undefined
245 | use a channel or sync.WaitGroup instead
246 |
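  A corrected version of the fragment above, using a channel (f() stands in
  for the real work; sync.WaitGroup would work equally well):

      package main

      import "fmt"

      func f() int { return 42 } // stand-in for the computation in the snippet

      func main() {
          var x int
          done := make(chan bool)
          go func() {
              x = f()
              done <- true // the send happens-before the receive below
          }()
          <-done // so main is guaranteed to see the write to x
          fmt.Println(x)
      }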
247 | Study the Go tutorials on goroutines and channels
248 |
--------------------------------------------------------------------------------
/original-notes/l03-remus.txt:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 3: Primary/Backup Replication
2 |
3 | Today
4 | Replication
5 | Remus case study
6 | Lab 2 introduction
7 |
8 | Fault tolerance
9 | we'd like a service that continues despite failures!
10 | available: still useable despite [some class of] failures
11 | correct: act just like a single server to clients
12 | very hard!
13 | very useful!
14 |
15 | Need a failure model: what will we try to cope with?
16 | Independent fail-stop computer failure
17 | Remus further assumes only one failure at a time
18 | Site-wide power failure (and eventual reboot)
19 | (Network partition)
20 | No bugs, no malice
21 |
22 | Core idea: replication
23 | *Two* servers (or more)
24 | Each replica keeps state needed for the service
25 | If one replica fails, others can continue
26 |
27 | Example: fault-tolerant MapReduce master
28 | lab 1 workers are already fault-tolerant, but not master
29 | master is a "single point of failure"
30 | can we have two masters, in case one fails?
31 | [diagram: M1, M2, workers]
32 | state:
33 | worker list
34 | which jobs done
35 | which workers idle
36 | TCP connection state
37 | program counter
38 |
39 | Big Questions:
40 | What state to replicate?
41 | How does replica get state?
42 | When to cut over to backup?
43 | Are anomalies visible at cut-over?
44 | How to repair / re-integrate?
45 |
46 | Two main approaches:
47 | State transfer
48 | "Primary" replica executes the service
49 | Primary sends [new] state to backups
50 | Replicated state machine
51 | All replicas execute all operations
52 | If same start state,
53 | same operations,
54 | same order,
55 | deterministic,
56 | then same end state
57 |
58 | State transfer is simpler
59 | But state may be large, slow to transfer
60 | Remus uses state transfer
61 |
62 | Replicated state machine can be more efficient
63 | If operations are small compared to data
64 | But complex, e.g. order on multi-core, determinism
65 | Labs use replicated state machines
66 |
67 | Remus: High Availability via Asynchronous Virtual Machine Replication
68 | NSDI 2008
69 |
70 | Very ambitious system:
71 | Whole-system replication
72 | Completely transparent to applications and clients
73 | High availability for any existing software
74 | Would be magic if it worked well!
75 | Failure model:
76 | 1. independent hardware faults
77 | 2. site-wide power failure
78 |
79 | Plan 1 (slow, broken):
80 | [diagram: app, O/S, Remus underneath]
81 | two machines, primary and backup; plus net and other machines
82 | primary runs o/s and application s/w, talks to clients, &c
83 | backup does *not* initially execute o/s, applications, &c
84 | it only executes some Remus code
85 | a few times per second:
86 | pause primary
87 | copy entire RAM, registers, disk to backup
88 | resume primary
89 | if primary fails:
90 | start backup executing!
91 |
92 | Q: is Plan 1 correct?
93 | i.e. does it look just like a single reliable server?
94 |
95 | Q: what will outside world see if primary fails and replica takes over?
96 | will backup have same state as last visible on primary?
97 | might a client request be lost? executed twice?
98 |
99 | Q: is Plan 1 efficient?
100 |
101 | Can we eliminate the fact that backup *state* trails the primary?
102 | Seems very hard!
103 | Primary would have to tell backup (and wait) on every instruction.
104 |
105 | Can we *conceal* the fact that backup's state lags primary?
106 | Prevent outside world from *seeing* that backup is behind last primary state
107 | e.g. prevent the case where the primary has sent an RPC reply but the backup's state doesn't reflect that RPC
108 | e.g. an MR Register RPC, which it would be bad for the backup to forget
109 | Idea: primary "holds" output until backup state catches up to output point
110 | e.g. primary receives RPC request, processes it, creates reply packet,
111 | but Remus holds reply packet until backup has received corresponding state update
112 |
113 | Remus epochs, checkpoints
114 | Clients: C1
115 | req1 reply1
116 | Primary: ... E1 ... | pause | E2 release | pause |
117 | ckpt ok ckpt
118 | Backup: ... (E0) ... | apply | (E1) |
119 | 1. Primary runs for a while in Epoch 1, holding E1's output
120 | 2. Primary pauses
121 | 3. Primary sends RAM+disk changes to backup (in background)
122 | 4. Primary resumes execution in E2, holding E2's output
123 | 5. Backup copies all to separate RAM, then ACKs
124 | 6. Primary releases E1's output
125 | 7. Backup applies E1's changes to RAM and disk
126 |
127 | If primary fails, backup finishes applying last epoch's disk+RAM,
128 | then starts executing
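
A minimal Go sketch of the primary-side epoch loop just described -- not Remus's actual Xen-based implementation; pauseVM, captureCheckpoint, sendToBackup, and releaseHeldOutput are assumed helpers, and the epoch length is arbitrary:

  // Sketch of the primary's epoch loop: run, pause, checkpoint, resume,
  // and only release buffered output once the backup has ACKed the checkpoint.
  package remus

  import "time"

  type Checkpoint struct{ DirtyPages, DiskWrites []byte } // stand-in representation

  func epochLoop(pauseVM, resumeVM func(),
      captureCheckpoint func() Checkpoint,
      sendToBackup func(Checkpoint) error, // blocks until the backup ACKs receipt
      releaseHeldOutput func()) {
      for {
          time.Sleep(25 * time.Millisecond) // 1. run an epoch; Remus holds its output
          pauseVM()                         // 2. pause the primary
          ckpt := captureCheckpoint()       // 3. RAM+disk changes since the last epoch
          resumeVM()                        // 4. resume in the next epoch, output still held
          if err := sendToBackup(ckpt); err != nil {
              continue // backup unreachable: keep running, but unreplicated
          }
          releaseHeldOutput() // 6. backup has this epoch's state, safe to release
      }
  }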
129 |
130 | Q: any externally visible anomalies?
131 | lost input/output?
132 | repeated output?
133 |
134 | Q: what if primary receives+executes a request, crashes before checkpoint?
135 | backup won't have seen request!
136 |
137 | Q: what if primary crashes after sending ckpt to backup,
138 | but before releasing output?
139 |
140 | Q: what if client doesn't use TCP -- doesn't re-transmit?
141 |
142 | Q: what if primary fails while sending state to backup?
143 | i.e. backup is mid-way through absorbing new state?
144 |
145 | Q: are there situations in which Remus will incorrectly activate the backup?
146 | i.e. primary is actually alive
147 | network partition...
148 |
149 | Q: when primary recovers, how does Remus restore replication?
150 | needed, since eventually active ex-backup will itself fail
151 |
152 | Q: what if *both* fail, e.g. site-wide power failure?
153 | RAM content will be lost, but disks will probably survive
154 | after power is restored, reboot guest from one of the disks
155 | O/S and application recovery code will execute
156 | disk must be "crash-consistent"
157 | so probably not the backup disk if it was in the middle of installing a checkpoint
158 | disk shouldn't reflect any held outputs (... why not?)
159 | so probably not the primary's disk if it was executing
160 | I do not understand this part of the paper (Section 2.5)
161 | seems to be a window during which neither disk could be used if power failed
162 | primary writes its disk during epoch
163 | meanwhile backup applies last epoch's writes to its disk
164 |
165 | Q: in what situations will Remus likely have good performance?
166 |
167 | Q: in what situations will Remus likely have low performance?
168 |
169 | Q: should epochs be short or long?
170 |
171 | Remus Evaluation
172 | summary: 1/2 to 1/4 native speed
173 | checkpoints are big and take time to send
174 | output hold limits speed at which clients can interact
175 |
176 | Why so slow?
177 | checkpoints are big and take time to generate and send
178 | 100ms for SPECweb2005 -- because many pages written
179 | so inter-checkpoint intervals must be long
180 | so output must be held for quite a while
181 | so client interactions are slow
182 | only 10 RPCs per second per client
183 |
184 | How could one get better performance for replication?
185 | big savings possible with application-specific schemes:
186 | just send state really needed by application, not all state
187 | send state in optimized format, not whole pages
188 | send operations if they are smaller than state
189 | likely *not* transparent to applications
190 | and probably not to clients either
191 |
192 | PRIMARY-BACKUP REPLICATION IN LAB 2
193 |
194 | outline:
195 | simple key/value database
196 | Get(k), Put(k, v), Append(k, v)
197 | primary and backup
198 | replicate by primary sending each operation to backups
199 | tolerate network problems, including partition
200 | either keep going, correctly
201 | or suspend operations until network is repaired
202 | allow replacement of failed servers
203 | you implement essentially all of this (unlike lab 1)
204 |
205 | "view server" decides who p and b are
206 | main goal: avoid "split brain" -- disagreement about who primary is
207 | clients and servers ask view server
208 | they don't make independent decisions
209 |
210 | repair:
211 | view server can co-opt "idle" server as b after old b becomes p
212 | primary initializes new backup's state
213 |
214 | key points:
215 | 1. only one primary at a time!
216 | 2. the primary must have the latest state!
217 | we will work out some rules to ensure these
218 |
219 | view server
220 | maintains a sequence of "views"
221 | view #, primary, backup
222 | 0: -- --
223 | 1: S1 --
224 | 2: S1 S2
225 | 3: S2 S3
226 | 4: S2 --
227 | monitors server liveness
228 | each server periodically sends a Ping RPC
229 | "dead" if missed N Pings in a row
230 | "live" after single Ping
231 | can be more than two servers Pinging view server
232 | if more than two, "idle" servers
233 | if primary is dead
234 | new view with previous backup as primary
235 | if backup is dead, or no backup
236 | new view with previously idle server as backup
237 | OK to have a view with just a primary, and no backup
238 | but -- if an idle server is available, make it the backup
239 |
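A rough Go sketch of the view-server logic described above (made-up names, not the lab's actual code; the real lab has additional rules that are omitted here):

  // Servers Ping periodically; a server is "dead" after DeadPings missed
  // intervals; tick() advances the view following the rules above
  // (in particular, only the backup may be promoted to primary).
  package viewservice

  import "time"

  const DeadPings = 5 // assumed threshold

  type View struct {
      Num             uint
      Primary, Backup string
  }

  type ViewServer struct {
      cur      View
      lastPing map[string]time.Time
      idle     []string // live servers that are neither primary nor backup
  }

  func NewViewServer() *ViewServer {
      return &ViewServer{lastPing: map[string]time.Time{}}
  }

  // Ping: a server reports it is alive; the reply tells it the current view.
  func (vs *ViewServer) Ping(server string, now time.Time) View {
      if _, known := vs.lastPing[server]; !known &&
          server != vs.cur.Primary && server != vs.cur.Backup {
          vs.idle = append(vs.idle, server)
      }
      vs.lastPing[server] = now
      return vs.cur
  }

  // tick runs once per ping interval and decides whether to move to a new view.
  func (vs *ViewServer) tick(now time.Time, interval time.Duration) {
      dead := func(s string) bool {
          return s == "" || now.Sub(vs.lastPing[s]) > DeadPings*interval
      }
      switch {
      case vs.cur.Num == 0 && len(vs.idle) > 0:
          vs.cur = View{1, vs.takeIdle(), vs.takeIdle()} // bootstrap the first view
      case dead(vs.cur.Primary) && !dead(vs.cur.Backup):
          // key rule: only the previous backup is promoted -- it has the latest state
          vs.cur = View{vs.cur.Num + 1, vs.cur.Backup, vs.takeIdle()}
      case vs.cur.Num > 0 && dead(vs.cur.Backup) && len(vs.idle) > 0:
          vs.cur = View{vs.cur.Num + 1, vs.cur.Primary, vs.takeIdle()}
      }
  }

  func (vs *ViewServer) takeIdle() string {
      if len(vs.idle) == 0 {
          return ""
      }
      s := vs.idle[0]
      vs.idle = vs.idle[1:]
      return s
  }
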
240 | how to ensure new primary has up-to-date replica of state?
241 | only promote previous backup
242 | i.e. don't make an idle server the primary
243 | backup must remember if it has been initialized by primary
244 | if not, don't function as primary even if promoted!
245 |
246 | Q: can more than one server think it is primary?
247 | 1: S1, S2
248 | net broken, so viewserver thinks S1 dead but it's alive
249 | 2: S2, --
250 | now S1 alive and not aware of view #2, so S1 still thinks it is primary
251 | AND S2 alive and thinks it is primary
252 | => split brain, no good
253 |
254 | how to ensure only one server acts as primary?
255 | even though more than one may *think* it is primary
256 | "acts as" == executes and responds to client requests
257 | the basic idea:
258 | 1: S1 S2
259 | 2: S2 --
260 | S1 still thinks it is primary
261 | S1 must forward ops to S2
262 | S2 thinks S2 is primary
263 | so S2 must reject S1's forwarded ops
264 |
265 | the rules:
266 | 1. primary in view i must have been primary or backup in view i-1
267 | 2. if you think you are primary, must wait for backup for each request
268 | 3. if you think you are not backup, reject forwarded requests
269 | 4. if you think you are not primary, reject direct client requests
270 |
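A Go sketch of rules 2-4 for a lab-2-style key/value server (made-up types, not the lab's actual RPC signatures):

  package pbservice

  import "errors"

  type View struct {
      Num             uint
      Primary, Backup string
  }

  type PutArgs struct{ Key, Value string }

  var ErrWrongServer = errors.New("wrong server")

  type KVServer struct {
      me      string
      view    View // latest view this server has heard from the view server
      data    map[string]string
      forward func(backup string, args *PutArgs) error // assumed RPC stub to the backup
  }

  // Put from a client.  Rule 4: reject unless we think we are primary.
  // Rule 2: if we do, the backup must accept the op before we reply.
  func (s *KVServer) Put(args *PutArgs) error {
      if s.view.Primary != s.me {
          return ErrWrongServer
      }
      if s.view.Backup != "" {
          if err := s.forward(s.view.Backup, args); err != nil {
              return err // backup rejected or unreachable: we cannot act as primary
          }
      }
      s.data[args.Key] = args.Value
      return nil
  }

  // ForwardedPut from the primary.  Rule 3: reject unless we think we are backup.
  func (s *KVServer) ForwardedPut(args *PutArgs) error {
      if s.view.Backup != s.me {
          return ErrWrongServer
      }
      s.data[args.Key] = args.Value
      return nil
  }

With these checks, once S2 has heard about view #2 it rejects S1's forwarded ops, so S1 can no longer act as primary -- the scenario walked through below.
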
271 | so:
272 | before S2 hears about view #2
273 | S1 can process ops from clients, S2 will accept forwarded requests
274 | S2 will reject ops from clients who have heard about view #2
275 | after S2 hears about view #2
276 | if S1 receives client request, it will forward, S2 will reject
277 | so S1 can no longer act as primary
278 | S1 will send error to client, client will ask vs for new view,
279 | client will re-send to S2
280 | the true moment of switch-over occurs when S2 hears about view #2
281 |
282 | how can new backup get state?
283 | e.g. all the keys and values
284 | if S2 is backup in view i, but was not in view i-1,
285 | S2 should ask primary to transfer the complete state
286 |
287 | rule for state transfer:
288 | every operation (Put/Get/Append) must be either before or after state xfer
289 | == state xfer must be atomic w.r.t. operations
290 | either
291 | op is before, and xferred state reflects op
292 | op is after, xferred state doesn't reflect op, prim forwards op after state
293 |
294 | Q: does primary need to forward Get()s to backup?
295 | after all, Get() doesn't change anything, so why does backup need to know?
296 | and the extra RPC costs time
297 |
298 | Q: how could we make primary-only Get()s work?
299 |
300 | Q: are there cases when the lab 2 protocol cannot make forward progress?
301 | View service fails
302 | Primary fails before new backup gets state
303 | We will start fixing those in lab 3
304 |
--------------------------------------------------------------------------------
/original-notes/l04-fds.txt:
--------------------------------------------------------------------------------
1 | 6.824 2014 Lecture 4: FDS Case Study
2 |
3 | Flat Datacenter Storage
4 | Nightingale, Elson, Fan, Hofmann, Howell, Suzue
5 | OSDI 2012
6 |
7 | why are we looking at this paper?
8 | Lab 2 wants to be like this when it grows up
9 | though details are all different
10 | fantastic performance -- world record cluster sort
11 | good systems paper -- details from apps all the way to network
12 |
13 | what is FDS?
14 | a cluster storage system
15 | stores giant blobs -- 128-bit ID, multi-megabyte content
16 | clients and servers connected by network with high bisection bandwidth
17 | for big-data processing (like MapReduce)
18 | cluster of 1000s of computers processing data in parallel
19 |
20 | high-level design -- a common pattern
21 | lots of clients
22 | lots of storage servers ("tractservers")
23 | partition the data
24 | master ("metadata server") controls partitioning
25 | replica groups for reliability
26 |
27 | why is this high-level design useful?
28 | 1000s of disks of space
29 | store giant blobs, or many big blobs
30 | 1000s of servers/disks/arms of parallel throughput
31 | can expand over time -- reconfiguration
32 | large pool of storage servers for instant replacement after failure
33 |
34 | motivating app: MapReduce-style sort
35 | a mapper reads its split, 1/Mth of the input file (e.g., a tract)
36 | map emits a (key, value) pair for each record in its split
37 | map partitions keys among R intermediate files (M*R intermediate files in total)
38 | a reducer reads 1 of R intermediate files produced by each mapper
39 | reads M intermediate files (of 1/R size)
40 | sorts its input
41 | produces 1/Rth of the final sorted output file (R blobs)
42 | FDS sort
43 | FDS sort does not store the intermediate files in FDS
44 | a client is both a mapper and reducer
45 | FDS sort is not locality-aware
46 | in mapreduce, master schedules workers on machines that are close to the data
47 | e.g., in same cluster
48 | later versions of FDS sort use more fine-grained work assignment
49 | e.g., mapper doesn't get 1/N of the input file but something smaller
50 | deals better with stragglers
51 |
52 | The Abstract's main claims are about performance.
53 | They set the world-record for disk-to-disk sorting in 2012 for MinuteSort
54 | 1,033 disks and 256 computers (136 tract servers, 120 clients)
55 | 1,401 Gbyte in 59.4s
56 |
57 | Q: does the abstract's 2 GByte/sec per client seem impressive?
58 | how fast can you read a file from Athena AFS? (abt 10 MB/sec)
59 | how fast can you read a typical hard drive?
60 | how fast can typical networks move data?
61 |
62 | Q: abstract claims recover from lost disk (92 GB) in 6.2 seconds
63 | that's 15 GByte / sec
64 | impressive?
65 | how is that even possible? that's 30x the speed of a disk!
66 | who might care about this metric?
67 |
68 | what should we want to know from the paper?
69 | API?
70 | layout?
71 | finding data?
72 | add a server?
73 | replication?
74 | failure handling?
75 | failure model?
76 | consistent reads/writes? (i.e. does a read see latest write?)
77 | config mgr failure handling?
78 | good performance?
79 | useful for apps?
80 |
81 | * API
82 | Figure 1
83 | 128-bit blob IDs
84 | blobs have a length
85 | only whole-tract read and write -- 8 MB
86 |
87 | Q: why are 128-bit blob IDs a nice interface?
88 | why not file names?
89 |
90 | Q: why do 8 MB tracts make sense?
91 | (Figure 3...)
92 |
93 | Q: what kinds of client applications is the API aimed at?
94 | and not aimed at?
95 |
96 | * Layout: how do they spread data over the servers?
97 | Section 2.2
98 | break each blob into 8 MB tracts
99 | TLT maintained by metadata server
100 | has n entries
101 | for blob b and tract t, i = (hash(b) + t) mod n
102 | TLT[i] contains list of tractservers w/ copy of the tract
103 | clients and servers all have copies of the latest TLT table
104 |
105 | Example four-entry TLT with no replication:
106 | 0: S1
107 | 1: S2
108 | 2: S3
109 | 3: S4
110 | suppose hash(27) = 2
111 | then the tracts of blob 27 are laid out:
112 | S1: 2 6
113 | S2: 3 7
114 | S3: 0 4 8
115 | S4: 1 5 ...
116 | FDS is "striping" blobs over servers at tract granularity
117 |
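A tiny Go sketch of the tract-locator computation above (FDS actually hashes 128-bit blob GUIDs; FNV here is just for illustration):

  package fds

  import "hash/fnv"

  type TLT [][]string // TLT[i] = tractservers holding replicas for entry i

  // tractServers: for blob b and tract t, i = (hash(b) + t) mod n.
  func tractServers(tlt TLT, blobID string, tract uint64) []string {
      h := fnv.New64a()
      h.Write([]byte(blobID))
      i := (h.Sum64() + tract) % uint64(len(tlt))
      return tlt[i]
  }

With the four-entry TLT above and hash(blob 27) = 2, tract 0 maps to entry 2 (S3), tract 1 to entry 3 (S4), tract 2 wraps around to entry 0 (S1), and so on -- exactly the striping shown.
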
118 | Q: why have tracts at all? why not store each blob on just one server?
119 | what kinds of apps will benefit from striping?
120 | what kinds of apps won't?
121 |
122 | Q: how fast will a client be able to read a single tract?
123 |
124 | Q: where does the abstract's single-client 2 GB number come from?
125 |
126 | Q: why not the UNIX i-node approach?
127 | store an array per blob, indexed by tract #, yielding tractserver
128 | so you could make per-tract placement decisions
129 | e.g. write new tract to most lightly loaded server
130 |
131 | Q: why not hash(b + t)?
132 |
133 | Q: how many TLT entries should there be?
134 | how about n = number of tractservers?
135 | why do they claim this works badly? Section 2.2
136 |
137 | The system needs to choose server pairs (or triplets &c) to put in TLT entries
138 | For replication
139 | Section 3.3
140 |
141 | Q: how about
142 | 0: S1 S2
143 | 1: S2 S1
144 | 2: S3 S4
145 | 3: S4 S3
146 | ...
147 | Why is this a bad idea?
148 | How long will repair take?
149 | What are the risks if two servers fail?
150 |
151 | Q: why is the paper's n^2 scheme better?
152 | TLT with n^2 entries, with every server pair occurring once
153 | 0: S1 S2
154 | 1: S1 S3
155 | 2: S1 S4
156 | 3: S2 S1
157 | 4: S2 S3
158 | 5: S2 S4
159 | ...
160 | How long will repair take?
161 | What are the risks if two servers fail?
162 |
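A sketch of how such an n^2-style TLT could be built -- one entry per ordered pair of distinct servers -- so that when one server fails, every other server holds some of its replicas and repair work is spread across the whole cluster (the paper additionally adds a random third server per entry, discussed next):

  package fds

  // buildTLT: one entry for each ordered pair of distinct servers,
  // giving n*(n-1) (~n^2) entries; every server is paired with every other.
  func buildTLT(servers []string) [][]string {
      var tlt [][]string
      for _, a := range servers {
          for _, b := range servers {
              if a != b {
                  tlt = append(tlt, []string{a, b})
              }
          }
      }
      return tlt
  }
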
163 | Q: why do they actually use a minimum replication level of 3?
164 | same n^2 table as before, third server is randomly chosen
165 | What effect on repair time?
166 | What effect on two servers failing?
167 | What if three disks fail?
168 |
169 | * Adding a tractserver
170 | To increase the amount of disk space / parallel throughput
171 | Metadata server picks some random TLT entries
172 | Substitutes new server for an existing server in those TLT entries
173 |
174 | * How do they maintain the n^2 plus one arrangement as servers leave/join?
175 | Unclear.
176 |
177 | Q: how long will adding a tractserver take?
178 |
179 | Q: what about client writes while tracts are being transferred?
180 | receiving tractserver may have copies from client(s) and from old srvr
181 | how does it know which is newest?
182 |
183 | Q: what if a client reads/writes but has an old tract table?
184 |
185 | * Replication
186 | A writing client sends a copy to each tractserver in the TLT.
187 | A reading client asks one tractserver.
188 |
189 | Q: why don't they send writes through a primary?
190 |
191 | Q: what problems are they likely to have because of lack of primary?
192 | why weren't these problems show-stoppers?
193 |
194 | * What happens after a tractserver fails?
195 | Metadata server stops getting heartbeat RPCs
196 | Picks random replacement for each TLT entry failed server was in
197 | New TLT gets a new version number
198 | Replacement servers fetch copies
199 |
200 | Example of the tracts each server holds:
201 | S1: 0 4 8 ...
202 | S2: 0 1 ...
203 | S3: 4 3 ...
204 | S4: 8 2 ...
205 |
206 | Q: why not just pick one replacement server?
207 |
208 | Q: how long will it take to copy all the tracts?
209 |
210 | Q: if a tractserver's net breaks and is then repaired, might srvr serve old data?
211 |
212 | Q: if a server crashes and reboots with disk intact, can contents be used?
213 | e.g. if it only missed a few writes?
214 | 3.2.1's "partial failure recovery"
215 | but won't it have already been replaced?
216 | how to know what writes it missed?
217 |
218 | Q: when is it better to use 3.2.1's partial failure recovery?
219 |
220 | * What happens when the metadata server crashes?
221 |
222 | Q: while metadata server is down, can the system proceed?
223 |
224 | Q: is there a backup metadata server?
225 |
226 | Q: how does rebooted metadata server get a copy of the TLT?
227 |
228 | Q: does their scheme seem correct?
229 | how does the metadata server know it has heard from all tractservers?
230 | how does it know all tractservers were up to date?
231 |
232 | * Random issues
233 |
234 | Q: is the metadata server likely to be a bottleneck?
235 |
236 | Q: why do they need the scrubber application mentioned in 2.3?
237 | why don't they delete the tracts when the blob is deleted?
238 | can a blob be written after it is deleted?
239 |
240 | * Performance
241 |
242 | Q: how do we know we're seeing "good" performance?
243 | what's the best you can expect?
244 |
245 | Q: limiting resource for 2 GB / second single-client?
246 |
247 | Q: Figure 4a: why starts low? why goes up? why levels off?
248 | why does it level off at that particular performance?
249 |
250 | Q: Figure 4b shows random r/w as fast as sequential (Figure 4a).
251 | is this what you'd expect?
252 |
253 | Q: why are writes slower than reads with replication in Figure 4c?
254 |
255 | Q: where does the 92 GB in 6.2 seconds come from?
256 | Table 1, 4th column
257 | that's 15 GB / second, both read and written
258 | 1000 disks, triple replicated, 128 servers?
259 | what's the limiting resource? disk? cpu? net?
260 |
261 | How big is each sort bucket?
262 | i.e. is the sort of each bucket in-memory?
263 | 1400 GB total
264 | 128 compute servers
265 | between 12 and 96 GB of RAM each
266 | hmm, say 50 on average, so total RAM may be 6400 GB
267 | thus the sort of each bucket fits in memory, so no intermediate sort passes are written to FDS
268 | thus total time is just four transfers of 1400 GB
269 | client limit: 128 * 2 GB/s = 256 GB / sec
270 | disk limit: 1000 * 50 MB/s = 50 GB / sec
271 | thus bottleneck is likely to be disk throughput
272 |
--------------------------------------------------------------------------------
/original-notes/l10-treadmarks.txt:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 10: Consistency
2 |
3 | Topic: consistency models
4 | = Interaction of reads/writes on different processors
5 | Many choices of models!
6 | Lax model => greater freedom to optimize
7 | Strict model => matches programmer intuition (e.g. read sees latest write)
8 | This tradeoff is a huge factor in many designs
9 | Treadmarks is a case study of relaxing to improve performance
10 |
11 | Treadmarks high level goals?
12 | Better DSM performance
13 | Run existing parallel code
14 |
15 | What specific problems with previous DSM are they trying to fix?
16 | false sharing: two machines r/w different vars on same page
17 | M1 writes x, M2 writes y
18 | M1 writes x, M2 just reads y
19 | Q: what does IVY do in this situation?
20 | write amplification: a one byte write turns into a whole-page transfer
21 |
22 | First Goal: eliminate write amplification
23 | don't send whole page, just written bytes
24 |
25 | Big idea: write diffs
26 | on M1 write fault:
27 | tell other hosts to invalidate but keep hidden copy
28 | M1 makes hidden copy as well
29 | on M2 fault:
30 | M2 asks M1 for recent modifications
31 | M1 "diffs" current page against hidden copy
32 | M1 sends diffs to M2 (and all machines w/ copy of this page)
33 | M2 applies diffs to its hidden copy
34 | M1 marks page r/o
35 |
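A minimal Go sketch of the write-diff mechanics above: keep a hidden copy ("twin") when the page is first written, then ship only the changed byte ranges (TreadMarks's real diff format differs; this is just the idea):

  package treadmarks

  type Diff struct {
      Offset int
      Bytes  []byte
  }

  // makeDiff compares the current page against its twin (hidden copy)
  // and records each run of changed bytes.
  func makeDiff(twin, page []byte) []Diff {
      var diffs []Diff
      for i := 0; i < len(page); {
          if page[i] == twin[i] {
              i++
              continue
          }
          j := i
          for j < len(page) && page[j] != twin[j] {
              j++
          }
          diffs = append(diffs, Diff{Offset: i, Bytes: append([]byte(nil), page[i:j]...)})
          i = j
      }
      return diffs
  }

  // applyDiff patches another machine's copy of the same page.
  func applyDiff(page []byte, diffs []Diff) {
      for _, d := range diffs {
          copy(page[d.Offset:], d.Bytes)
      }
  }
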
36 | Q: do write diffs change the consistency model?
37 | At most one writeable copy, so writes are ordered
38 | No writing while any copy is readable, so no stale reads
39 | Readable copies are up to date, so no stale reads
40 | Still sequentially consistent
41 |
42 | Q: do write diffs fix false sharing?
43 |
44 | Next goal: allow multiple readers+writers
45 | to cope with false sharing
46 | => don't invalidate others when a machine writes
47 | => don't demote writers to r/o when another machine reads
48 | => multiple *different* copies of a page!
49 | which should a reader look at?
50 | diffs help: can merge writes to same page
51 | but when to send the diffs?
52 | no invalidations -> no page faults -> what triggers sending diffs?
53 |
54 | Big idea: release consistency (RC)
55 | no-one should read data w/o holding a lock!
56 | so let's assume a lock server
57 | send out write diffs on release
58 | to *all* machines with a copy of the written page(s)
59 |
60 | Example 1 (RC and false sharing)
61 | x and y are on the same page
62 | M0: a1 for(...) x++ r1
63 | M1: a2 for(...) y++ r2 a1 print x, y r1
64 | What does RC do?
65 | M0 and M1 both get cached writeable copy of the page
66 | during release, each computes diffs against original page,
67 | and sends them to all copies
68 | M1's a1 causes it to wait until M0's release
69 | so M1 will see M0's writes
70 |
71 | Q: what is the performance benefit of RC?
72 | What does IVY do with Example 1?
73 | multiple machines can have copies of a page, even when 1 or more writes
74 | => no bouncing of pages due to false sharing
75 | => read copies can co-exist with writers
76 |
77 | Q: does RC change the consistency model? yes!
78 | M1 won't see M0's writes until M0 releases a lock
79 | I.e. M1 can see a stale copy of x; not possible w/ IVY
80 | if you always lock:
81 | locks force order -> no stale reads
82 |
83 | Q: what if you don't lock?
84 | reads can return stale data
85 | concurrent writes to same var -> trouble
86 |
87 | Q: does RC make sense without write diffs?
88 | probably not: diffs needed to reconcile concurrent writes to same page
89 |
90 | Big idea: lazy release consistency (LRC)
91 | only send write diffs to next acquirer of released lock,
92 | not to everyone
93 |
94 | Example 2 (lazyness)
95 | x and y on same page (otherwise IVY avoids copy too)
96 | everyone starts with a copy of that page
97 | M0: a1 x=1 r1
98 | M1: a2 y=1 r2
99 | M2: a1 print x r1
100 | What does LRC do?
101 | M2 only asks previous holder of lock 1 for write diffs
102 | M2 does not see M1's y=1, even tho on same page (so print y would be stale)
103 | What does RC do?
104 | What does IVY do?
105 |
106 | Q: what's the performance win from LRC?
107 | if you don't acquire lock on object, you don't see updates to it
108 | => if you use just some vars on a page, you don't see writes to others
109 | => less network traffic
110 |
111 | Q: does LRC provide the same consistency model as RC?
112 | no: LRC hides some writes that RC reveals
113 | in above example, RC reveals y=1 to M2, LRC does not reveal
114 | so "M2: print x, y" might print fresh data for RC, stale for LRC
115 | depends on whether print is before/after M1's release
116 |
117 | Q: is LRC a win over IVY if each variable on a separate page?
118 | or a win over IVY plus write diffs?
119 | note IVY's fault-driven page reads are lazy at page granularity
120 |
121 | Do we think all threaded/locking code will work with LRC?
122 | Stale reads unless every shared memory location is locked!
123 | Do programs lock every shared memory location they read?
124 | No: people lock to make updates atomic.
125 | if no concurrent update possible, people don't lock.
126 |
127 | Example 3 (programs don't lock all shared data)
128 | x, y, and z on the same page
129 | M0: x := 7 a1 y = &x r1
130 | M1: a1 a2 z = y r2 r1
131 | M2: a2 print *z r2
132 | will M2 print 7?
133 | LRC as described so far in this lecture would *not* print 7!
134 | M2 will see the pointer in z, but will have stale content in x's memory.
135 |
136 | For real programs to work, Treadmarks must provide "causal consistency":
137 | when you see a value,
138 | you also see other values which might have influenced its computation.
139 | "influenced" means "processor might have read".
140 |
141 | How to track which writes influenced a value?
142 | Number each machine's releases -- "interval" numbers
143 | Each machine tracks highest write it has seen from each other machine
144 | a "Vector Timestamp"
145 | Tag each release with current VT
146 | On acquire, tell previous holder your VT
147 | difference indicates which writes need to be sent
148 | (annotate previous example)
149 |
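A small Go sketch of that bookkeeping (a simplification of TreadMarks's actual interval/write-notice structures):

  package treadmarks

  // VT[m] = highest interval (release) number this machine has seen from machine m.
  type VT []int

  // missing: on acquire, the previous lock holder compares VTs and returns,
  // per machine, the interval range (acquirer's+1 .. holder's) whose write
  // notices/diffs the acquirer still needs.
  func missing(acquirer, holder VT) map[int][2]int {
      need := map[int][2]int{}
      for m := range holder {
          if holder[m] > acquirer[m] {
              need[m] = [2]int{acquirer[m] + 1, holder[m]}
          }
      }
      return need
  }

  // merge: once the diffs are applied, the acquirer's VT covers both.
  func merge(dst, src VT) {
      for m := range dst {
          if src[m] > dst[m] {
              dst[m] = src[m]
          }
      }
  }
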
150 | VTs order writes to same variable by different machines:
151 | M0: a1 x=1 r1 a2 y=9 r2
152 | M1: a1 x=2 r1
153 | M2: a1 a2 z = x + y r2 r1
154 | M2 is going to hear "x=1" from M0, and "x=2" from M1.
155 | How does M2 know what to do?
156 |
157 | Could the VTs for two values of the same variable not be ordered?
158 | M0: a1 x=1 r1
159 | M1: a2 x=2 r2
160 | M2: a1 a2 print x r2 r1
161 |
162 | Summary of programmer rules / system guarantees
163 | 1. each shared variable protected by some lock
164 | 2. lock before writing a shared variable
165 | to order writes to same var
166 | otherwise "latest value" not well defined
167 | 3. lock before reading a shared variable
168 | to get the latest version
169 | 4. if no lock for read, guaranteed to see values that
170 | contributed to the variables you did lock
171 |
172 | Example of when LRC might work too hard.
173 | M0: a2 z=99 r2 a1 x=1 r1
174 | M1: a1 y=x r1
175 | TreadMarks will send z to M1, because it comes before x=1 in VT order.
176 | Assuming x and z are on the same page.
177 | Even if on different pages, M1 must invalidate z's page.
178 | But M1 doesn't use z.
179 | How could a system understand that z isn't needed?
180 | Require locking of all data you read
181 | => Relax the causal part of the LRC model
182 |
183 | Q: could TreadMarks work without using VM page protection?
184 | it uses VM to
185 | detect writes to avoid making hidden copies (for diffs) if not needed
186 | detect reads to pages => know whether to fetch a diff
187 | neither is really crucial
188 | so TM doesn't depend on VM as much as IVY does
189 | IVY used VM faults to decide what data has to be moved, and when
190 | TM uses acquire()/release() and diffs for that purpose
191 |
192 | Performance?
193 |
194 | Figure 3 shows mostly good scaling
195 | is that the same as "good"?
196 | though apparently Water does lots of locking / sharing
197 |
198 | How close are they to best possible performance?
199 | maybe Figure 5 implies there is only about 20% fat to be cut
200 |
201 | Does LRC beat previous DSM schemes?
202 | they only compare against their own straw-man ERC
203 | not against best known prior work
204 | Figure 9 suggests lazyness only a win for Water
205 | most pages used by most processors, so eager moves a lot of data
206 |
207 | What happened to DSM?
208 | The cluster approach was a great idea
209 | Targeting *existing* threaded code was not a long-term win
210 | Overtaken by MapReduce and successors
211 | MR tolerates faults
212 | MR guides programmer to good split of data and computation
213 | BUT people have found MR too rigid for many parallel tasks
214 | The last word has not been spoken here
215 | Much recent work on flexible memory-like cluster programming models
216 | RDDs/Spark, FaRM, Piccolo
217 |
--------------------------------------------------------------------------------
/original-notes/l14-spark.txt:
--------------------------------------------------------------------------------
1 | 6.824 2014 Lecture 4: Spark Case Study
2 |
3 | Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing
4 | Zaharia, Chowdhury, Das, Dave, Ma, McCauley, Franklin, Shenker, Stoica
5 | NSDI 2012
6 |
7 | Had TreadMarks since 1996, and Distributed Shared Memory is a very general abstraction. Why use MapReduce? Or why even use TreadMarks?
8 | Say looking through a log, why not implement it using the regular abstractions (sockets, files etc?)
9 | Saves a lot of work:
10 | communication between nodes
11 | distribute code
12 | schedule work
13 | handle failures
14 |
15 |
16 | The MapReduce paper had a lot of impact on big data analytics: simple and powerful.
17 | But it's a bit too rigid. Other systems proposed fixes:
18 |
19 | Dryad (Microsoft 2007): any directed acyclic graph, edges are communication channels, can be through disk or via TCP.
20 | + can implement multiple iterations
21 | + can pipeline through RAM, don't have to go to disk
22 | - very low level:
23 | doesn't deal with partitioning of data, want 100,000 mappers? add 100,000 nodes
24 | what happens if you run out of RAM? (brief mention of "downgrading" a TCP channel to a disk file)
25 | - doesn't checkpoint/replicate, in the middle of the run (so failures can be expensive)
26 |
27 | * Pig latin (Yahoo 2008): programming language that compiles to MapReduce. Adds "Database style" operators, mainly Join
28 | Join: dataset 1 (k1,v1), dataset 2 (k1, v2). ==> (k1, v1, v2), takes cartesian product (all tuples of combinations of v1, v2 with same k1)
29 | Example: dataset 1: all clicks on products on website, dataset 2: demographics (age of users), want average age of customer per product.
30 | + allows multiple iterations
31 | + can express more
32 | - still has rigidness from MR (writes to disk after map, to replicated storage after reduce, RAM)
33 |
34 |
35 | Spark
36 |
37 | A framework for large scale distributed computation.
38 | An expressive programming model (can express iteration and joins)
39 | Gives user control over the trade-off between fault tolerance and performance
40 | if user persists frequently w/ REPLICATE, fast recovery but slower execution
41 | if infrequently, fast execution but slow recovery
42 |
43 | Relatively recent release, but used by (partial list) IBM, Groupon, Yahoo, Baidu..
44 | Can get substantial performance gains when dataset (or a major part of it) can fit in memory, so anticipated to get more traction.
45 | MapReduce is simple
46 |
47 | Abstraction of Resilient Distributed Datasets: an RDD is a collection of partitions of records.
48 | Two operations on RDDs:
49 | Transformations: compute a new RDD from existing RDDs (flatMap, reduceByKey)
50 | this just specifies a plan. runtime is lazy - doesn't have to materialize (compute), so it doesn't
51 | Actions: where some effect is requested: result to be stored, get specific value, etc.
52 | causes RDDs to materialize.
53 |
54 | Logistic regression (from paper):
55 | val points = spark.textFile(...)
56 | .map(parsePoint).persist()
57 | var w = // random initial vector
58 | for (i <- 1 to ITERATIONS) {
59 | val gradient = points.map{ p =>
60 | p.x * (1/(1+exp(-p.y*(w dot p.x)))-1)*p.y
61 | }.reduce((a,b) => a+b)
62 | w -= gradient
63 | }
64 |
65 | * w is sent with the closure to the nodes
66 | * materializes a new RDD in every loop iteration
67 |
68 |
69 | PageRank (from paper):
70 | val links = spark.textFile(...).map(...).persist() // (URL, outlinks)
71 | var ranks = // RDD of (URL, rank) pairs
72 | for (i <- 1 to ITERATIONS) {
73 | // Build an RDD of (targetURL, float) pairs
74 | // with the contributions sent by each page
75 | val contribs = links.join(ranks).flatMap {
76 | (url, (links, rank)) =>
77 | links.map(dest => (dest, rank/links.size))
78 | }
79 | // Sum contributions by URL and get new ranks
80 | ranks = contribs.reduceByKey((x,y) => x+y)
81 | .mapValues(sum => a/N + (1-a)*sum)
82 | }
83 |
84 |
85 | What is an RDD (table 3, S4)
86 | list of partitions
87 | list of (parent RDD, wide/narrow dependency)
88 | function to compute
89 | partitioning scheme
90 | computation placement hint
91 | Each transformation takes (one or more) RDDs, and outputs the transformed RDD.
92 |
93 | Q: Why does an RDD carry metadata on its partitioning?
94 | A: so transformations that depend on multiple RDDs know whether they need to shuffle data (wide dependency) or not (narrow)
95 | Allows users control over locality and reduces shuffles.
96 |
97 | Q: Why the distinction between narrow and wide dependencies?
98 | A: In case of failure.
99 | narrow dependency only depends on a few partitions that need to be recomputed.
100 | wide dependency might require an entire RDD
101 |
102 | Handling faults.
103 | When Spark computes, by default it generates only one copy of each result and doesn't replicate. Whether that result is kept in RAM or on disk, if the node holding it fails permanently, the data is gone.
104 | When some partition is lost and needs to be recomputed, the scheduler needs to find a way to recompute it. (a fault can be detected by using a heartbeat)
105 | will need to compute all partitions it depends on, until a partition in RAM/disk, or in replicated storage.
106 | if it's a wide dependency, recomputation needs all partitions of that parent RDD; if narrow, just the one partition it depends on
107 |
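A conceptual sketch, in Go rather than Spark's Scala API, of lineage-based recovery: rebuild a lost partition by recursing into its parents until reaching partitions that are still available; with a narrow dependency only a few parent partitions are touched, with a wide one potentially all of them:

  package rdd

  // Partition is a toy stand-in for one partition of an RDD plus its lineage.
  type Partition struct {
      Available bool         // still in RAM/disk or in replicated storage
      Data      []int        // toy record type
      Parents   []*Partition // narrow dep: few parents; wide dep: many
      Compute   func(parents [][]int) []int
  }

  // recoverPartition recomputes a lost partition from its lineage.
  func recoverPartition(p *Partition) []int {
      if p.Available {
          return p.Data
      }
      inputs := make([][]int, len(p.Parents))
      for i, parent := range p.Parents {
          inputs[i] = recoverPartition(parent) // long lineage => long recovery (hence checkpointing)
      }
      p.Data = p.Compute(inputs)
      p.Available = true
      return p.Data
  }
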
108 | So two mechanisms enable recovery from faults: lineage, and policy of what partitions to persist (either to one node or replicated)
109 | We talked about lineage before (Transformations)
110 |
111 | The user can call persist on an RDD.
112 | With RELIABLE flag, will keep multiple copies (in RAM if possible, disk if RAM is full)
113 | With REPLICATE flag, will write to stable storage (HDFS)
114 | Without flags, will try to keep in RAM (will spill to disk when RAM is full)
115 |
116 | Q: Is persist a transformation or an action?
117 | A: neither. It doesn't create a new RDD, and doesn't cause materialization. It's an instruction to the scheduler.
118 |
119 | Q: By calling persist without flags, is it guaranteed that in case of fault that RDD wouldn't have to be recomputed?
120 | A: No. There is no replication, so a node holding a partition could fail.
121 | Replication (either in RAM or in stable storage) is necessary
122 |
123 | Currently only manual checkpointing via calls to persist.
124 | Q: Why implement checkpointing? (it's expensive)
125 | A: Long lineage could cause large recovery time. Or when there are wide dependencies a single failure might require many partition re-computations.
126 |
127 | Checkpointing is like buying insurance: pay writing to stable storage so can recover faster in case of fault.
128 | Depends on frequency of failure and on cost of slower recovery
129 | An automatic checkpointing scheme would take these into account, together with the size of the data (how long it takes to write) and the computation time.
130 |
131 | So can handle a node failure by recomputing lineage up to partitions that can be read from RAM/Disk/replicated storage.
132 | Q: Can Spark handle network partitions?
133 | A: Nodes that cannot communicate with scheduler will appear dead. The part of the network that can be reached from scheduler can continue
134 | computation, as long as it has enough data to start the lineage from (if all replicas of a required partition cannot be reached, cluster
135 | cannot make progress)
136 |
137 |
138 | What happens when there isn't enough memory?
139 | - LRU (Least Recently Used) on partitions
140 | - first on non-persisted
141 | - then persisted (but they will be available on disk. makes sure user cannot overbook RAM)
142 | - user can have control on order of eviction via "persistence priority"
143 | - no reason not to discard non-persisted partitions (if they've already been used)
144 |
145 | Shouldn't throw away partitions in RAM that are still required but haven't been used yet.
146 |
147 | degrades to "almost" MapReduce behavior
148 | In figure 7, k-means on 100 Hadoop nodes takes 76-80 seconds
149 | In figure 12, k-means on 25 Spark nodes (with no partitions allowed in memory) takes 68.8 seconds
150 | Difference could be because MapReduce would use replicated storage after reduce, but Spark by default only spills to local disk, avoiding network latency and I/O load on replicas.
151 | no architectural reason why Spark would be slower than MR
152 |
153 | Spark assumes it runs on an isolated memory space (multiple schedulers don't share the memory pool well).
154 | Can be solved using a "unified memory manager"
155 | Note that when there is reuse of RDDs between jobs, they need to run on the same scheduler to benefit anyway.
156 |
157 |
158 |
159 | (from [P09])
160 | Why not just use parallel databases? Commercially available: "Teradata, Aster Data, Netezza,
161 | DATAllegro (and therefore soon Microsoft SQL Server via Project Madison), Dataupia, Vertica,
162 | ParAccel, Neoview, Greenplum, DB2 (via the Database Partitioning Feature),
163 | and Oracle (via Exadata)"
164 |
165 | At the time, Parallel DBMS were
166 | * Some are expensive and hard to set up right
167 | * SQL declarative (vs. procedural)
168 | * Required schema, indices etc. (an advantage in some cases)
169 | * "Not made here"
170 |
171 |
172 | Piccolo [P10] uses snapshots of a distributed key-value store to handle fault tolerance.
173 | - Computation is comprised of control functions and kernel functions.
174 | - Control functions are responsible for setting up tables (also locality), launching kernels, synchronization (barriers that wait for all kernels to complete), and starting checkpoints
175 | - Kernels use the key value store. There is a function to merge conflicting writes.
176 | - Checkpoints using Chandy-Lamport
177 | * all data has to fit in RAM
178 | * to recover, all nodes need to revert (expensive)
179 | * no way to mitigate stragglers, cannot just re-run a kernel without reverting to a snapshot
180 |
181 |
182 |
183 | [P09] "A Comparison of Approaches to Large-Scale Data Analysis", Pavlo et al. SIGMOD'09
184 | [P10] Piccolo: Building Fast, Distributed Programs with Partitioned Tables, Power and Li, OSDI'10
185 |
186 |
--------------------------------------------------------------------------------
/original-notes/l23-bitcoin.txt:
--------------------------------------------------------------------------------
1 | 6.824 2015 Lecture 23: Bitcoin
2 |
3 | Bitcoin: A Peer-to-Peer Electronic Cash System, by Satoshi Nakamoto
4 | 2008
5 |
6 | why aren't credit cards the perfect digital money?
7 | + work online
8 | + hard to steal (a complex situation)
9 | +- can cancel transactions, call customer support
10 | - relies on 3rd parties to verify (banks)
11 | b/c users cannot directly verify charges
12 | - 3% fee
13 | - long settling time
14 | - hard to become a merchant
15 | +- tied to currency controlled by government
16 |
17 | bitcoin: e-money without a central trusted party
18 | a public ledger: anyone can verify transactions
19 |
20 | what's hard technically?
21 | forgery
22 | double spending
23 | theft
24 |
25 | what's hard socially/economically?
26 | why does it have value?
27 | how to pay for infrastructure?
28 | monetary policy (intentional inflation &c)
29 | laws (taxes, laundering, drugs, terrorists)
30 |
31 | let's design OneBit, a simple e-money system
32 | to illustrate a public, verifiable ledger using transaction chain
33 | each user owns some coins
34 | single server -- OneBank -- everyone talks to it
35 | OneBank records all transactions
36 |
37 | OneBit transactions
38 | every coin has a chain of transaction records
39 | one for each time this coin was transferred as payment
40 | OneBank maintains the complete chain for each coin
41 | chain helps ensure that only the current owner spends
42 |
43 | what's in a OneBit transaction record?
44 | public key of new owner
45 | hash of this coin's previous transaction record
46 | signed by private key of previous owner
47 | (BitCoin is much more complex: amount (fractional), multiple in/out, ...)
48 |
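A Go sketch of a OneBit-style transaction record as described above; ed25519 and SHA-256 are used just for concreteness (Bitcoin itself uses ECDSA over secp256k1 and a richer transaction format):

  package onebit

  import (
      "crypto/ed25519"
      "crypto/sha256"
  )

  type Txn struct {
      NewOwner ed25519.PublicKey // pub(Z): who may spend this coin next
      PrevHash [32]byte          // hash(T7): the exact coin (previous record) being spent
      Sig      []byte            // sig(Y): signature by the previous owner's private key
  }

  func (t *Txn) signedBytes() []byte {
      return append(append([]byte{}, t.NewOwner...), t.PrevHash[:]...)
  }

  func hashTxn(t *Txn) [32]byte {
      return sha256.Sum256(append(t.signedBytes(), t.Sig...))
  }

  // makeTxn: Y spends the coin whose latest record is prev, paying it to Z.
  func makeTxn(prev *Txn, yPriv ed25519.PrivateKey, zPub ed25519.PublicKey) *Txn {
      t := &Txn{NewOwner: zPub, PrevHash: hashTxn(prev)}
      t.Sig = ed25519.Sign(yPriv, t.signedBytes())
      return t
  }

  // verify is what OneBank (or any peer) checks: the new record points at prev
  // and is signed by prev's owner.  Checking that no *other* record also spends
  // prev is a separate lookup in the ledger.
  func verify(prev, t *Txn) bool {
      return t.PrevHash == hashTxn(prev) &&
          ed25519.Verify(prev.NewOwner, t.signedBytes(), t.Sig)
  }
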
49 | OneBit example:
50 | Y owns a coin, previously given to it by X:
51 | T7: pub(Y), hash(T6), sig(X)
52 | Y buys a hamburger from Z and pays with this coin
53 | Z tells Y Z's public key ("address")
54 | Y creates a new transaction and signs it
55 | T8: pub(Z), hash(T7), sig(Y)
56 | OneBank verifies that:
57 | no other transaction mentions hash(T7),
58 | T8's sig() corresponds to T7's pub()
59 | Z waits until OneBank has seen/verified T8,
60 | verifies that T8's pub() is Z's public key,
61 | then Z gives hamburger to Y
62 |
63 | why include pub(Z)?
64 |
65 | why sign with sig(Y)?
66 |
67 | why include hash(T7)?
68 | hash(T7) identifies the exact coin to be spent
69 | a coin ID scheme might be ambiguous if Y owned this coin previously
70 |
71 | where is Z's resulting coin value "stored"?
72 | coin balance = unspent xaction
73 | the "identity" of a coin is the (hash of) its most recent xaction
74 | Z "owns" the coin: has private key that allows Z to make next xaction
75 |
76 | does OneBit's transaction chain prevent stealing?
77 | current owner's private key needed to sign next transaction
78 | danger: attacker can steal Z's private key
79 | Z uses private key a lot, so probably on his PC, easy to steal?
80 | a significant problem for BitCoin in practice
81 |
82 | what if OneBank is corrupt?
83 | it can't forge transactions (doesn't know private keys)
84 | but it can help people double-spend!
85 |
86 | double-spending with OneBit
87 | suppose OneBank is cooperating with Y to cheat Z or Q
88 | Y creates two transactions for same coin: Y->Z, Y->Q
89 | both have a hash pointing to the same current end of the chain
90 | OneBank shows chain ending in Y->Z to Z, and Y->Q to Q
91 | both transactions look good, including signatures and hash
92 | now both Z and Q will give hamburgers to Y
93 |
94 | why was double-spending possible?
95 | OneBank can *hide* some information,
96 | or selectively reveal it
97 |
98 | what's needed?
99 | many servers ("peers")
100 | send all transactions to all peers
101 | much harder for a few bad peers to hide transactions
102 | conventions to un-fork if problems do arise
103 |
104 | the BitCoin block chain
105 | single block chain contains transactions on all coins
106 | a copy stored in each peer
107 | so each peer can validate new transactions against old ones
108 | each block:
109 | hash(prevblock)
110 | set of transactions
111 | "nonce" (not quite a nonce in the usual cryptographic sense)
112 | current time (wall clock timestamp)
113 | a transaction isn't real unless it's in the block chain
114 | new block every 10 minutes containing xactions since prev block
115 |
116 | who creates each new block?
117 | all the peers try
118 | requirement: hash(block) < "target"
119 | peers try nonce values until this works out
120 | can't predict a winning nonce, since cryptographic hash
121 | trying one nonce is fast, but most nonces won't work
122 | it would take one CPU months to create one block
123 | but thousands of peers are working on it
124 | such that the expected time for the first peer to find a block is about 10 minutes
125 | the winner sends the new block to all peers
126 | (this is part of "mining")
127 |
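A toy Go sketch of that mining loop (real Bitcoin double-SHA256s an 80-byte header and encodes the target compactly in a "bits" field; this just shows the brute-force search):

  package bitcoin

  import (
      "bytes"
      "crypto/sha256"
      "encoding/binary"
      "math/big"
  )

  type Block struct {
      PrevHash [32]byte
      TxnRoot  [32]byte // stand-in for the block's set of transactions
      Time     int64    // wall clock timestamp
      Nonce    uint64
  }

  func hashBlock(b *Block) [32]byte {
      var buf bytes.Buffer
      buf.Write(b.PrevHash[:])
      buf.Write(b.TxnRoot[:])
      binary.Write(&buf, binary.LittleEndian, b.Time)
      binary.Write(&buf, binary.LittleEndian, b.Nonce)
      return sha256.Sum256(buf.Bytes())
  }

  // mine tries nonce values until hash(block) < target.  Each try is cheap,
  // but with a small target almost all tries fail -- that is the proof of work.
  func mine(b *Block, target *big.Int) {
      for {
          h := hashBlock(b)
          if new(big.Int).SetBytes(h[:]).Cmp(target) < 0 {
              return
          }
          b.Nonce++
      }
  }
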
128 | how do transactions work w/ block chain?
129 | start: all peers know ...<-B5
130 | and are working on B6 (trying different nonces)
131 | Z sends public key (address) to Y
132 | Y sends Y->Z transaction to peers, which flood it
133 | peers buffer the transaction until B6 computed
134 | peers that heard Y->Z include it in next block
135 | so eventually ...<-B5<-B6<-B7, where B7 includes Y->Z
136 |
137 | what if bad person wants to double-spend?
138 | start with block chain ...<-B6
139 | Y gets Y->Z into block chain
140 | ...<-B6<-BZ (BZ contains Y->Z)
141 | Z will see Y->Z in chain and give Y a hamburger
142 | can Y create ...<-B6<-BQ
143 | and persuade peers to accept it in place of ...<-B6<-BZ?
144 |
145 | when will a peer accept chain CX it hears from another peer?
146 | suppose peer already knows of chain CY
147 | it will accept CX if len(CX) > len(CY)
148 | and if CX passes some validity tests
149 | will not accept if len(CX) = len(CY): first chain of same length wins
150 |
151 | so attacker needs a longer chain to double-spend
152 | e.g. ...<-B6<-BQ<-B8, which is longer than ...<-B6<-BZ
153 | and must create it in less than 10 minutes
154 | *before* main chain grows by another block
155 | 10 minutes is the time it takes the 1000s of honest peers
156 | to find one block
157 | if the attacker has just one CPU, will take months to create the blocks
158 | by that time the main chain will be much longer
159 | no peer will accept the attacker's shorter chain
160 | attacker has no chance
161 | if the attacker has 1000s of CPUs -- more than all the honest
162 | bitcoin peers -- then the attacker can double spend
163 |
164 | summary:
165 | attacker can force honest peers to switch chains if attacker
166 | controls majority of peer CPU power
167 |
168 | how long should Z wait before giving Y the hamburger?
169 | until Z sees Y flood the transaction to many peers?
170 | no -- not in the chain, Y might flood conflicting xaction
171 | maybe OK for low-value transactions
172 | until Z sees chain ...<-BZ?
173 | maybe
174 | risky -- non-zero chance that some other chain will win
175 | i.e. some lucky machine discovered a few blocks in a row
176 | quickly, but its network msgs have so far been lost
177 | perhaps that chain won't have Y->Z
178 | probability goes down rapidly with number of blocks
179 | until Z sees chain with multiple blocks after BZ?
180 | yes -- slim chance attacker with few CPUs could catch up
181 |
182 | validation checks:
183 | much of burden is on (honest) peers, to check new xactions/blocks
184 | to avoid ever having to scan the whole block chain
185 | and so that clients don't have to maintain the whole block chain
186 | peer, new xaction:
187 | no other transaction refers to same previous transaction
188 | signature is by private key of previous transaction
189 | [ then will add transaction to txn list for new block being mined ]
190 | peer, new block:
191 | hash value < target (i.e. nonce is right, proof of work)
192 | previous block hash exists
193 | new chain longer than current chain
194 | all transactions in block are valid
195 | Z:
196 | Y->Z is in a recent block
197 | Z's public key / address is in the transaction
198 | multiple peers have accepted that block
199 | there's several more blocks in the chain
200 | (other stuff has to be checked as well, lots of details)
201 |
202 | where does each bitcoin originally come from?
203 | each time a peer creates a block, it gets 25 bitcoins
204 | assuming it is the winner for that block
205 | it puts its public key in a special transaction in the block
206 | this is incentive for people to operate bitcoin peers
207 | but that number halves every 210,000 blocks (abt 4 years)
208 | the point: motivate people to run peers
209 |
210 | Q: how do peers communicate / find each other?
211 |
212 | Q: what prevents a bad peer from modifying an existing block?
213 |
214 | Q: what if two nodes disagree on the validity of a transaction?
215 | (slight implementation differences between software versions)
216 |
217 | Q: 10 minutes is annoying; could it be made much shorter?
218 |
219 | Q: are transactions anonymous? pseudonymous? analogy: IP addresses.
220 |
221 | Q: can bitcoins be stolen?
222 |
223 | Q: if I steal bitcoins, is it safe to spend them?
224 |
225 | Q: what can adversary do with a majority of CPU power in the world?
226 | can double-spend
227 | cannot steal others' bitcoins
228 | can prevent xaction from entering chain
229 | can revert past xactions (by building longer chain from before that block)
230 |
231 | Q: how rich are you likely to get with one machine mining?
232 |
233 | Q: if more people (CPUs) mine, will that create new bitcoin faster?
234 | important use of block timestamps: control difficulty (hash target)
235 |
236 | Q: why mine at all? why not start with a fixed number of coins?
237 |
238 | Q: why does it make sense for the mining reward to decrease with time?
239 |
240 | Q: is it a problem that there will be a fixed number of coins?
241 |
242 | Q: what if the real economy grows (or shrinks)?
243 |
244 | Q: why do bitcoins have value?
245 | e.g., 1 BTC appears to be around US$242 on May 12th, 2015.
246 |
247 | Q: will we still need banks, credit cards, &c?
248 | today, dollar bills are only a small fraction of total money
249 | same may be true of bitcoin
250 | so properties of bitcoin (anonymity &c) may not be very important
251 |
252 | Q: what are the benefits of banks, credit cards, &c?
253 | disputes (Z takes payment and does not give hamburger to Y)
254 | loss / recovery (user cannot find their private key)
255 |
256 | Q: will bitcoin scale well?
257 | as transaction rate increases?
258 | claim CPU limits to 4,000 tps (signature checks)
259 | more than Visa but less than cash
260 | as block chain length increases?
261 | do you ever need to look at very old blocks?
262 | do you ever need to xfer the whole block chain?
263 | merkle tree: block headers vs txn data.
264 |
265 | key idea: block chain
266 | public ledger is a great idea
267 | decentralization might be good
268 | mining seems imperfect, but does avoid centralized trust
269 | tying the ledger to a new currency seems bad
270 |
--------------------------------------------------------------------------------
/original-notes/pbft-2001.txt:
--------------------------------------------------------------------------------
1 | 6.824 2001 Lecture 23: Impractical Byzantine Agreement
2 |
3 | The problem:
4 | Enemy camp.
5 | Three armies, three generals, G0, G1, G2.
6 | Can communicate with trustworthy messengers.
7 | They need to agree on whether to attack at dawn.
8 | But one of the generals might be a traitor.
9 | Two loyal generals needed for victory; defeat if only one attacks.
10 |
11 | Straw man:
12 | In advance the generals designate G0 as the leader.
13 | G0 sends an order to each of G1 and G2.
14 | RULE: G1 and G2 follow the order.
15 | If G1 or G2 is the traitor, this procedure works:
16 | G0 and the non-traitor both attack (or both do nothing).
17 | What if G0 is the traitor?
18 | G0 could tell G1 to attack, and G2 to do nothing.
19 |
20 | Can G1 and G2 detect G0's treachery by comparing notes?
21 | G1 and G2 tell each other what G0 told them.
22 | RULE: if Gx sees same from G0 and Gy, obey.
23 | Otherwise do nothing.
24 | If G0 is the traitor, this procedure will detect it.
25 | But does it still work if G1 (or G2) is the traitor?
26 | Suppose G0 tells both to attack.
27 | G1 tells G2 "G0 told me to do nothing".
28 | So G2 decides G0 is the traitor, and does nothing, while G0 attacks.
29 |
30 | Why is this problem interesting?
31 | We've talked a lot about replicas: Bayou, Porcupine, DDS.
32 | We've assumed replicas are fail-stop.
33 | I.e. a typical failure is crash, power failure, network failure.
34 | What if replica software malfunctions?
35 | Sends incorrect messages, performs incorrect operations on data.
36 | What if a replica is run by a malicious operator?
37 |
38 | Byzantine Agreement is what we need if replicas can't be trusted.
39 | Generals == replicas.
40 | Orders == update operations.
41 | Single-copy correctness demands that all replicas perform the same ops.
42 | In the same order.
43 | So "agreement to attack" == all replicas agree on next update.
44 | And traitor == a replica that wants an incorrect result:
45 | Hard to forge actual operations in the real world.
46 | Traitor might cause inconsistent order of operations.
47 | Traitor might prevent any progress from being made.
48 | Assumption of only one traitor == withstand single failure.
49 | More generally, want to withstand some fraction of failures.
50 |
51 | Back to the original problem.
52 | We ran into trouble trusting G1's claim "G0 told me to do nothing".
53 | We can fix this with digital signatures.
54 | Now the basic algorithm is:
55 | G0 sends signed orders to G1 and G2.
56 | G1 and G2 exchange those orders.
57 | Signatures allow us to ignore fake quoted orders.
58 | RULE: if Gx gets the same from G0 and Gy, obey.
59 | Otherwise do nothing.
60 | If G0 is the traitor and sends different orders,
61 | G1 and G2 will both retreat -- OK.
62 | If G1 is the traitor, what can he do?
63 | Cannot forge a contrary order from G0.
64 | BUT G1 can just not send anything to G2!
65 | Then G0 will attack, but G2 will wait forever (and not attack).
66 |
67 | So we have the danger that delayed msgs can cause incorrect results!
68 | Rather than, as expected, merely delaying execution.
69 | Can we use non-receipt as a sign a general is corrupt?
70 | RULE: If Gx gets cmd from G0, and nothing from Gy, follow cmd.
71 | If Gx gets cmd from G0 and Gy, follow it.
72 | Otherwise do nothing.
73 | Then a traitorous G0 can cause incorrect operation:
74 | Send "attack" to G2. Send nothing to G1.
75 | G2 will time out and attack; G0 and G1 won't.
76 |
77 | Note that we still have a problem even if G0, G1 and G2 are loyal.
78 | Adversary may delay or discard messages.
79 |
--------------------------------------------------------------------------------
/original-notes/pbft-2009.txt:
--------------------------------------------------------------------------------
1 | 6.824 2009 Lecture 19: Security: Byzantine Fault Tolerance
2 |
3 | Failures in labs 7 and 8:
4 | - Nodes crash and stop responding
5 | - Failure detector (heartbeater) to detect failures
6 | - detector can make mistakes
7 | - Network delays are arbitrary; nothing we can do about this
8 | - however, detector will *eventually* remove all failed nodes
9 | - this is crucial for the replication protocol to work
10 |
11 | Byzantine failure model:
12 | - nodes fail in *arbitrary* ways
13 | - often thought of as ``adversarial'' model
14 | - node is compromised, attacker tries to break your protocol
15 | - can also handle bugs, misconfigurations, etc.
16 | - as before, must assume uncorrelated failures
17 | - design verification + n-version programming
18 | - *can't* write a failure detector to eventually detect all Byzantine faults
19 |
20 | RSM Protocol from the labs:
21 | - 3 parts: replication protocol, view changes, recovery
22 | - Replication protocol:
23 | - Primary sends op to all the backups
24 | - Backups execute; may have to roll back via state xfer if primary fails
25 | - Primary replies to client after hearing from all backups
26 | - View changes (Paxos)
27 | - Recovery:
28 | - Needed if a view change caused the primary to change
29 | - Correctness conditions:
30 | - If the client got a reply for a request in the previous view,
31 | request must carry forward to this view
32 | - All replicas must agree on state of the system
33 | - Any backup in the old view knows at least as much as the old primary
34 | - pick one, all replicas download state from that backup
35 |
36 | Q: Do we actually need all the replicas to reply in the replication protocol?
37 | A: No. f+1 responses are enough, but it complicates recovery
38 | - need to poll f+1 replicas, and recover from the one that is most
39 | up-to-date
40 | - viewstamped replication does this
41 |
42 | Today, will show how to adapt this protocol to handle Byzantine faults.
43 | - BFT protocol is based on viewstamped replication
44 | - VR (Oki&Liskov 88) is basically the same protocol as the one from the labs,
45 | with the f+1 modification discussed above
46 |
47 | How many replicas do we need to handle f fail-stop faults?
48 | - f+1 will ensure integrity but not availability (e.g., 2PC)
49 | - f nodes fail, remaining node still has the data
50 | - 2f+1 can ensure availability + durability (e.g., Paxos)
51 | - f nodes fail, remaining f+1 are a majority and can still make decisions
52 |
53 | How many replicas do we need to handle f Byzantine faults?
54 | - f+1 won't work at all; f Byzantine nodes can always outvote 1 correct node
55 | - 2f+1 can preserve integrity *IF* we hear from all 2f+1
56 | - does NOT ensure availability
57 | - can't wait for last f nodes to reply; they might be Byzantine
58 | - why aren't f+1 (matching) replies enough?
59 | - example: f=1; replicas A, B, C; A is faulty; x is 0
60 | - client 1: write x=1, get replies from A and B
61 | - client 2: read x, get replies from A and C (A equivocates, says x=0)
62 | - 3f+1 replicas preserve integrity and availability (safety + liveness)
63 | - use a quorum of 2f+1 replicas for every op (can't wait for the last f)
64 | - any two quorums of 2f+1 (out of 3f+1) overlap in >= 2(2f+1) - (3f+1) = f+1 replicas, so in at least one good replica
65 | - good replicas will never agree to conflicting values
66 |
67 | Q: How does this compare to SUNDR?
68 |
69 | PBFT attempt 1:
70 | - Use RSM protocol from lab, fixed size group of 3f+1 replicas
71 | - Sign all client requests and messages to handle Byzantine nodes
72 | - Protocol:
73 | - Replication protocol:
74 | - primary sends op
75 | - 2f+1 replicas execute it and reply
76 | - primary replies to client with 2f+1 matching responses
77 | - View change and recovery protocols:
78 | - do view change if it seems the primary isn't making progress
79 | - will discuss later
80 | - Problem: Byzantine primary can send different ops to different replicas
81 |
82 | PBFT attempt 2:
83 | - nodes don't execute an op until they know that 2f+1 replicas have
84 | assigned the same vs to the same op
85 | - Replication protocol:
86 | Client->primary: S_c(op)
87 | Primary->replicas: S_primary(PREPREPARE(S_c(op), vs))
88 | Replicas->primary: S_rep(PREPARE(op, vs))
89 | Primary->replicas: { set of 2f+1 prepares } = prepared certificate
90 | Replicas->Primary: S_rep(REPLY(rep, vs))
91 | Primary->Client: { set of 2f+1 replies }
92 |
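A small Go sketch of the "prepared certificate" bookkeeping implied above: a replica treats (op, vs) as prepared once it holds 2f+1 PREPARE messages for the same op and viewstamp from distinct replicas (signature checking elided):

  package pbft

  type Viewstamp struct {
      View, Seq uint64
  }

  type Prepare struct {
      Replica int      // which replica signed this PREPARE
      OpHash  [32]byte // hash of the client operation
      VS      Viewstamp
  }

  // preparedCertificate: true once 2f+1 distinct replicas have sent matching
  // PREPAREs; the set of such messages is the "prepared certificate".
  func preparedCertificate(prepares []Prepare, op [32]byte, vs Viewstamp, f int) bool {
      seen := map[int]bool{}
      for _, p := range prepares {
          if p.OpHash == op && p.VS == vs {
              seen[p.Replica] = true
          }
      }
      return len(seen) >= 2*f+1
  }
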
93 | Q: What do replicas need to check before they can send a prepare?
94 | A:
95 | - correct view, not in the middle of recovery / view change, etc.
96 | - valid signature from client
97 | - valid signature from primary
98 | - already prepared all requests with lower sequence numbers (why?)
99 |
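As a rough illustration of that checklist, a small Go sketch; the Viewstamp, Preprepare,
and Replica types and their fields are invented for the example, not the paper's actual
data structures, and signature checks are abstracted into booleans.

    package main

    import "fmt"

    // Illustrative viewstamp: (view number, sequence number).
    type Viewstamp struct {
        View, Seq int
    }

    // A PREPREPARE as a replica might see it; signatures are abstracted
    // into booleans for this sketch.
    type Preprepare struct {
        VS           Viewstamp
        Op           string
        ClientSigOK  bool // signature on the embedded client request verified
        PrimarySigOK bool // primary's signature on the PREPREPARE verified
    }

    type Replica struct {
        currentView  int
        inViewChange bool
        lastPrepared int // highest sequence number already prepared
    }

    // okToPrepare mirrors the checklist above: right view, not in the middle
    // of a view change, valid signatures, no gaps in the sequence numbers.
    func (r *Replica) okToPrepare(m Preprepare) bool {
        if m.VS.View != r.currentView || r.inViewChange {
            return false
        }
        if !m.ClientSigOK || !m.PrimarySigOK {
            return false
        }
        // Prepare in order: otherwise a faulty primary could leave holes
        // in the sequence and replicas would diverge.
        return m.VS.Seq == r.lastPrepared+1
    }

    func main() {
        r := &Replica{currentView: 3, lastPrepared: 7}
        m := Preprepare{VS: Viewstamp{View: 3, Seq: 8}, Op: "put(x,1)",
            ClientSigOK: true, PrimarySigOK: true}
        fmt.Println("send PREPARE?", r.okToPrepare(m)) // true
    }
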
100 | Q: What is the commit point?
101 | A: When f+1 non-faulty replicas have a prepared certificate.
102 | Need to talk about view changes to understand this.
103 |
104 | Q: Is this protocol correct?
105 | A: From the client's POV, no problem if it gets 2f+1 replies with
106 | matching viewstamps. (This proves we reached the commit point.)
107 | But the replicas have no idea when requests have committed;
108 | this makes checkpoints / garbage collection impossible.
109 |
110 | NB: In the lab, we don't worry about GC or concurrent requests;
111 | backups don't care whether the primary executed the op or not.
112 |
113 | Full PBFT replication protocol:
114 | - Add a commit phase to tell the replicas that the request committed
115 | in the current view. Replicas send S_rep(COMMIT(op, vs)) to the
116 | primary when they have a prepared certificate, and the primary
117 | forwards a set of 2f+1 commits to all the replicas.
118 |
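One way to picture the certificate counting in this primary-relayed variant: a sketch
only, with invented names, assuming matching messages whose signatures have already
been checked.

    package main

    import "fmt"

    const f = 1            // assumed number of Byzantine faults tolerated
    const quorum = 2*f + 1 // certificate size used throughout

    // certTracker counts matching, already-verified messages for one
    // (viewstamp, op) pair.
    type certTracker struct {
        voted map[string]bool // replica ID -> counted
    }

    func newTracker() *certTracker {
        return &certTracker{voted: map[string]bool{}}
    }

    // add records one vote and reports whether a full certificate exists.
    func (t *certTracker) add(replicaID string) bool {
        t.voted[replicaID] = true
        return len(t.voted) >= quorum
    }

    func main() {
        prepares, commits := newTracker(), newTracker()

        // 2f+1 matching PREPAREs -> prepared certificate: the op's order is
        // fixed, so the replica may send COMMIT, but must not execute yet.
        for _, id := range []string{"r0", "r1", "r2"} {
            if prepares.add(id) {
                fmt.Println("prepared certificate complete -> send COMMIT")
            }
        }
        // 2f+1 COMMITs -> committed certificate: the request committed in
        // this view, so execute and discard it at the next checkpoint.
        for _, id := range []string{"r0", "r2", "r3"} {
            if commits.add(id) {
                fmt.Println("committed certificate complete -> execute")
            }
        }
    }
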
119 | Differences between what I described and the paper:
120 | - the version I described uses the tentative execution optimization
121 | (see sec 5.1); similar to lab 8
122 | - the version in the paper saves two message delays by having
123 | replicas multicast prepares and commits instead of going through
124 | the primary
125 |
126 | BFT View change protocol:
127 | - Replicas send S_rep(DOVIEWCHANGE, list of prepared certificates)
128 | to the *new* primary and stop executing in the current view.
129 | - The new primary collects 2f+1 DOVIEWCHANGE messages and sends
130 | S_p(NEWVIEW, list of 2f+1 DOVIEWCHANGE messages). It also sends
131 | PREPREPARE messages for all the requests that were supposed to
132 | commit in the previous view (i.e., there is a prepared certificate
133 | for it in one of the DOVIEWCHANGE messages.) This ensures that all
134 | requests that were supposed to commit in the previous view but
135 | didn't will be carried forward to the new view.
136 |
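A simplified Go sketch of what the new primary does with the DOVIEWCHANGE messages.
The types are invented, and it ignores the view numbers inside certificates and the
possibility of conflicting certificates, which the real protocol has to handle.

    package main

    import "fmt"

    const f = 1

    // PreparedCert stands in for a 2f+1-prepare certificate for (seq, op);
    // in the real protocol it carries the signed PREPAREs themselves.
    type PreparedCert struct {
        Seq int
        Op  string
    }

    type DoViewChange struct {
        From  string
        Certs []PreparedCert
    }

    // newViewPreprepares merges the certificates from 2f+1 DOVIEWCHANGE
    // messages: any request that some replica holds a prepared certificate
    // for must be re-proposed so it carries forward into the new view.
    func newViewPreprepares(msgs []DoViewChange) map[int]string {
        if len(msgs) < 2*f+1 {
            panic("need 2f+1 DOVIEWCHANGE messages")
        }
        ops := map[int]string{}
        for _, m := range msgs {
            for _, c := range m.Certs {
                ops[c.Seq] = c.Op
            }
        }
        return ops
    }

    func main() {
        msgs := []DoViewChange{
            {From: "r1", Certs: []PreparedCert{{Seq: 8, Op: "put(x,1)"}}},
            {From: "r2", Certs: nil}, // this replica never saw seq 8 prepare
            {From: "r3", Certs: []PreparedCert{{Seq: 8, Op: "put(x,1)"}}},
        }
        fmt.Println("PREPREPAREs to re-send in the new view:", newViewPreprepares(msgs))
    }
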
137 | Q: What if the new primary doesn't send the right preprepares?
138 | A: Replicas have to check that the primary sent the right preprepares
139 | based on the DOVIEWCHANGE messages that came with the NEWVIEW.
140 |
141 | Q: What if the primary sends different sets of DOVIEWCHANGE messages
142 | to different replicas?
143 | A: Won't matter; if the req is committed, 2f+1 replicas in the old view
144 | had prepared certificates for it, so the primary can't come up with
145 | a set of 2f+1 DOVIEWCHANGE messages that lack that request.
146 |
147 | Q: Why is this view change protocol shorter than Paxos?
148 | A: Everyone already knows who the primary for view v+1 is going to be,
149 | so there's nothing to agree on; replicas just need to check that
150 | the new primary told everyone the right thing.
151 |
152 | NB: You can make a similar simplification in VR. Labs 7/8 need full Paxos.
153 |
154 | BFT Recovery protocol (simplified):
155 | - go back to the last checkpoint and roll forward
156 | - execute preprepares from the primary (see view change protocol)
157 |
158 | Checkpoints
159 | - reduce cost of new views, GC the log
160 | - details painful, affects design of replication and recovery protocols
161 |
162 | Protocol summary:
163 | - Preprepare informs replicas about a client request
164 | - Prepared certificate (2f+1 prepares) proves that the order
165 | proposed by the primary is okay (because a quorum was willing
166 | to prepare it). Does not guarantee that req will survive a VC.
167 | - Commit point is when f+1 non-faulty replicas have a prepared
168 | certificate. (At least one of them will present the certificate
169 | to the new primary in a VC.)
170 | - Committed certificate (2f+1 commits) proves that request committed
171 | in the current view (so can execute it and forget about it at the
172 | next checkpoint)
173 |
174 | Performance:
175 | - Table 1: trivial op 4x as expensive as unreplicated. not surprising
176 | - Table 3: BFT FS *faster* than unreplicated NFS. Why?
177 | - no synchronous writes in the common case. Is this safe?
178 |
179 | Other optimizations:
180 | - Use hashes of ops if the ops are large
181 | - Use MACs instead of signatures (this is hard; need a different view
182 | change protocol)
183 | - Fast reads: Client sends read-only request to all; they reply immediately
184 | - Only f+1 replicas execute the request; the rest just agree to the ops
185 | - Batching: If requests come in too fast, combine several requests into one.
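
As a rough sketch of the batching idea only (not PBFT's actual code), a primary might
drain its queue of pending client requests and run one round of agreement for the
whole batch; the helper below is hypothetical.

    package main

    import "fmt"

    // collectBatch drains whatever client requests are already queued (up
    // to max) so one run of the agreement protocol can order all of them.
    func collectBatch(reqs <-chan string, max int) []string {
        batch := []string{<-reqs} // block until at least one request arrives
        for len(batch) < max {
            select {
            case r := <-reqs:
                batch = append(batch, r)
            default: // queue drained
                return batch
            }
        }
        return batch
    }

    func main() {
        reqs := make(chan string, 16)
        for i := 0; i < 5; i++ {
            reqs <- fmt.Sprintf("op%d", i)
        }
        fmt.Println("batch:", collectBatch(reqs, 10)) // [op0 op1 op2 op3 op4]
    }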
186 |
--------------------------------------------------------------------------------
/original-notes/pbft-2010.txt:
--------------------------------------------------------------------------------
1 | 6.824 2009 Lecture 19: Security: Byzantine Fault Tolerance
2 |
3 | Failures in labs 7 and 8:
4 | - Nodes crash and stop responding
5 | - Failure detector (heartbeater) to detect failures
6 | - detector can make mistakes
7 | - Network delays are arbitrary; nothing we can do about this
8 | - however, detector will *eventually* remove all failed nodes
9 | - this is crucial for the replication protocol to work
10 |
11 | Byzantine failure model:
12 | - nodes fail in *arbitrary* ways
13 | - often thought of as ``adversarial'' model
14 | - node is compromised, attacker tries to break your protocol
15 | - can also handle bugs, misconfigurations, etc.
16 | - as before, must assume uncorrelated failures
17 | - design verification + n-version programming
18 | - *can't* write a failure detector to eventually detect all Byzantine faults
19 |
20 | RSM Protocol from the labs:
21 | - 3 parts: replication protocol, view changes, recovery
22 | - Replication protocol:
23 | - Primary sends op to all the backups
24 | - Backups execute; may have to roll back via state xfer if primary fails
25 | - Primary replies to client after hearing from all backups
26 | - View changes (Paxos)
27 | - Recovery:
28 | - Needed if a view change caused the primary to change
29 | - Correctness conditions:
30 | - If the client got a reply for a request in the previous view,
31 | request must carry forward to this view
32 | - All replicas must agree on state of the system
33 | - Any backup in the old view knows at least as much as the old primary
34 | - pick one, all replicas download state from that backup
35 |
36 | Q: Do we actually need all the replicas to reply in the replication protocol?
37 | A: No. f+1 responses are enough, but it complicates recovery
38 | - need to poll f+1 replicas, and recover from the one that is most
39 | up-to-date
40 | - viewstamped replication does this
41 |
42 | Today, will show how to adapt this protocol to handle Byzantine faults.
43 | - BFT protocol is based on viewstamped replication
44 | - VR (Oki&Liskov 88) is basically the same protocol as the one from the labs,
45 | with the f+1 modification discussed above
46 |
47 | How many replicas do we need to handle f fail-stop faults?
48 | - f+1 will ensure integrity but not availability (e.g., 2PC)
49 | - f nodes fail, remaining node still has the data
50 | - 2f+1 can ensure availability + durability (e.g., Paxos)
51 | - f nodes fail, remaining f+1 are a majority and can still make decisions
52 |
53 | How many replicas do we need to handle f Byzantine faults?
54 | - f+1 won't work at all; f Byzantine nodes can always outvote 1 correct node
55 | - 2f+1 can preserve integrity *IF* we hear from all 2f+1
56 | - does NOT ensure availability
57 | - can't wait for last f nodes to reply; they might be Byzantine
58 | - why aren't f+1 (matching) replies enough?
59 | - example: f=1; replicas A, B, C; A is faulty; x is 0
60 | - client 1: write x=1, get replies from A and B
61 | - client 2: read x, get replies from A and C (A equivocates, says x=0)
62 | - 3f+1 replicas preserve integrity and availability (safety + liveness)
63 | - use a quorum of 2f+1 replicas for every op (can't wait for the last f)
64 | - any two quorums of 2f+1 must intersect in at least one good replica
65 | - good replicas will never agree to conflicting values
66 |
67 | Q: How does this compare to SUNDR?
68 |
69 | PBFT attempt 1:
70 | - Use RSM protocol from lab, fixed size group of 3f+1 replicas
71 | - Sign all client requests and messages to handle Byzantine nodes
72 | - Protocol:
73 | - Replication protocol:
74 | - primary sends op
75 | - 2f+1 replicas execute it and reply
76 | - primary replies to client with 2f+1 matching responses
77 | - View change and recovery protocols:
78 | - do view change if it seems the primary isn't making progress
79 | - will discuss later
80 | - Problem: Byzantine primary can send different ops to different replicas
81 |
82 | PBFT attempt 2:
83 | - nodes don't execute an op until they know that 2f+1 replicas have
84 | assigned the same vs to the same op
85 | - Replication protocol:
86 | Client->primary: S_c(op)
87 | Primary->replicas: S_primary(PREPREPARE(S_c(op), vs))
88 | Replicas->primary: S_rep(PREPARE(op, vs))
89 | Primary->replicas: { set of 2f+1 prepares } = prepared certificate
90 | Replicas->Primary: S_rep(REPLY(rep, vs))
91 | Primary->Client: { set of 2f+1 replies }
92 |
93 | Q: What do replicas need to check before they can send a prepare?
94 | A:
95 | - correct view, not in the middle of recovery / view change, etc.
96 | - valid signature from client
97 | - valid signature from primary
98 | - already prepared all requests with lower sequence numbers (why?)
99 |
100 | Q: What is the commit point?
101 | A: When f+1 non-faulty replicas have a prepared certificate.
102 | Need to talk about view changes to understand this.
103 |
104 | Q: Is this protocol correct?
105 | A: From the client's POV, no problem if it gets 2f+1 replies with
106 | matching viewstamps. (This proves we reached the commit point.)
107 | But the replicas have no idea when requests have committed;
108 | this makes checkpoints / garbage collection impossible.
109 |
110 | NB: In the lab, we don't worry about GC or concurrent requests;
111 | backups don't care whether the primary executed the op or not.
112 |
113 | Full PBFT replication protocol:
114 | - Add a commit phase to tell the replicas that the request committed
115 | in the current view. Replicas send S_rep(COMMIT(op, vs)) to the
116 | primary when they have a prepared certificate, and the primary
117 | forwards a set of 2f+1 commits to all the replicas.
118 |
119 | Differences between what I described and the paper:
120 | - the version I described uses the tentative execution optimization
121 | (see sec 5.1); similar to lab 8
122 | - the version in the paper saves two message delays by having
123 | replicas multicast prepares and commits instead of going through
124 | the primary
125 |
126 | BFT View change protocol:
127 | - Replicas send S_rep(DOVIEWCHANGE, list of prepared certificates)
128 | to the *new* primary and stop executing in the current view.
129 | - The new primary collects 2f+1 DOVIEWCHANGE messages and sends
130 | S_p(NEWVIEW, list of 2f+1 DOVIEWCHANGE messages). It also sends
131 | PREPREPARE messages for all the requests that were supposed to
132 | commit in the previous view (i.e., there is a prepared certificate
133 | for it in one of the DOVIEWCHANGE messages.) This ensures that all
134 | requests that were supposed to commit in the previous view but
135 | didn't will be carried forward to the new view.
136 |
137 | Q: What if the new primary doesn't send the right preprepares?
138 | A: Replicas have to check that the primary sent the right preprepares
139 | based on the DOVIEWCHANGE messages that came with the NEWVIEW.
140 |
141 | Q: What if the primary sends different sets of DOVIEWCHANGE messages
142 | to different replicas?
143 | A: Won't matter; if the req is committed, 2f+1 replicas in the old view
144 | had prepared certificates for it, so the primary can't come up with
145 | a set of 2f+1 DOVIEWCHANGE messages that lack that request.
146 |
147 | Q: Why is this view change protocol shorter than Paxos?
148 | A: Everyone already knows who the primary for view v+1 is going to be,
149 | so there's nothing to agree on; replicas just need to check that
150 | the new primary told everyone the right thing.
151 |
152 | NB: You can make a similar simplification in VR. Labs 7/8 need full Paxos.
153 |
154 | BFT Recovery protocol (simplified):
155 | - go back to the last checkpoint and roll forward
156 | - execute preprepares from the primary (see view change protocol)
157 |
158 | Checkpoints
159 | - reduce cost of new views, GC the log
160 | - details painful, affects design of replication and recovery protocols
161 |
162 | Protocol summary:
163 | - Preprepare informs replicas about a client request
164 | - Prepared certificate (2f+1 prepares) proves that the order
165 | proposed by the primary is okay (because a quorum was willing
166 | to prepare it). Does not guarantee that req will survive a VC.
167 | - Commit point is when f+1 non-faulty replicas have a prepared
168 | certificate. (At least one of them will present the certificate
169 | to the new primary in a VC.)
170 | - Committed certificate (2f+1 commits) proves that request committed
171 | in the current view (so can execute it and forget about it at the
172 | next checkpoint)
173 |
174 | Performance:
175 | - Table 1: trivial op 4x as expensive as unreplicated. not surprising
176 | - Table 3: BFT FS *faster* than unreplicated NFS. Why?
177 | - no synchronous writes in the common case. Is this safe?
178 |
179 | Other optimizations:
180 | - Use hashes of ops if the ops are large
181 | - Use MACs instead of signatures (this is hard; need a different view
182 | change protocol)
183 | - Fast reads: Client sends read-only request to all; they reply immediately
184 | - Only f+1 replicas execute the request; the rest just agree to the ops
185 | - Batching: If requests come in too fast, combine several requests into one.
186 |
--------------------------------------------------------------------------------
/original-notes/pbft-2011.txt:
--------------------------------------------------------------------------------
1 | 6.824 2009 Lecture 18: Security: Byzantine Fault Tolerance
2 |
3 | so far we have been assuming fail-stop behavior
4 | computer/network either executes protocol correctly
5 | or halts (and maybe repaired after a while)
6 |
7 | even fail-stop is hard to cope with
8 | did the server crash, or is the network broken?
9 | is the network broken, or just delaying packets?
10 | is the network partitioned?
11 | what if fail-stop failures/repair so frequent we can do no work?
12 | if system is quiet for a while,
13 | ping+paxos will produce a useful configuration
14 |
15 | now we're going to be more ambitious
16 | use replication to help with security
17 |
18 | what is Byzantine behavior?
19 | a larger set of ways in which a computer can misbehave
20 | includes fail-stop faults
21 | plus bugs (i.e. incorrect execution)
22 | plus intentional malice
23 | plus conspiracies -- multiple bad nodes cooperating to trick the good ones
24 | (devious, surreptitious)
25 |
26 | what kinds of Byzantine behaviour could cause trouble in the labs?
27 | lock server primary gives away same lock to multiple clients
28 | lock server backup responds to pings, but does nothing else
29 | will be kept in Paxos config, but will prevent progress
30 | backup might fake a double-grant from primary,
31 | trick system administrator into thinking primary is faulty,
32 | thus trick admin into replacing good primary with malicious backup
33 | backup DDoSs primary, Paxos causes backup to take over as primary,
34 | then gives away locks to multiple clients
35 |
36 | specific assumptions in the paper's Byzantine fault model
37 | network can delay, duplicate, re-order, or drop any message
38 | "faulty" vs "non-faulty" replicas
39 | faulty replicas may be controlled by a malicious attacker
40 | the attacker:
41 | supplies the code that faulty replicas run
42 | knows the code the non-faulty replicas are running
43 | can read network messages
44 | cannot guess non-faulty replicas' cryptographic keys
45 | knows the faulty replicas' keys
46 | can force messages to be delayed
47 |
48 | how can we possibly cope with Byzantine replicas?
49 | assume no more than some fraction of replicas are faulty
50 |
51 | the basic Byzantine fault tolerance approach
52 | replicated state machines
53 | ask them all to do each operation
54 | compare answers from replicas
55 | if enough agree, perhaps that's the correct answer
56 |
57 | straw man 1:
58 | [client, n servers]
59 | n servers
60 | client sends request to all of them
61 | waits for all n to reply
62 | only proceeds if all n agree
63 | why might this not work well?
64 | faulty replica can stop progress by disagreeing
65 |
66 | straw man 2:
67 | let's have replicas vote
68 | 2f+1 servers, assume no more than f are faulty
69 | if client gets f+1 matching replies, success
70 | otherwise, failure
71 | why might this not work well?
72 | client can't wait for replies from the last f replicas
73 | they might be faulty, never going to reply
74 | so must be able to make a decision after n-f replies, or f+1 if n=2f+1
75 | but f of the first f+1 replies might be from faulty replicas!
76 | i.e. f+1 is not enough to vote
77 | also waiting for f+1 of 2f+1 doesn't ensure that majority of good nodes executed
78 |
79 | straw man 3:
80 | 3f+1 servers, of which at most f are faulty
81 | client waits for 2f+1 replies
82 | client takes majority of those 2f+1
83 | why does this work? informally...
84 | client will get at least 2f+1 replies: all non-faulty replicas eventually respond
85 | at most f of 2f+1 are faulty
86 | the f+1 non-faulty replicas will agree on the answer
87 |
88 | what about handling multiple clients?
89 | non-faulty replicas must process operations in the same order!
90 |
91 | let's use a primary to choose operation order
92 | picks an order and assigns sequence numbers
93 | for now, assume primary is non-faulty
94 |
95 | straw man 4: with primary
96 | pick sequence #
97 | send *signed* message to each replica (including itself)
98 | from here on, every message signed by sender using public key crypto
99 | wait for 2f+1 replies
100 | f+1 must match
101 | reply to client
102 |
103 | REQ MSG REP REP
104 | C
105 | 0
106 | 1
107 | 2
108 | 3
109 |
110 | what if the primary is faulty?
111 | it might send different operations to different replicas
112 | or send them in different orders to different replicas
113 | or send a wrong result to the client
114 | or do nothing
115 |
116 | general approach to handling faulty primary
117 | clients notify replicas of each operation, as well as primary
118 | each replica watches progress of each operation
119 | if no progress, force view change -> new primary
120 | view change must sort out state of last operation
121 |
122 | can a replica execute an operation when it first receives it from primary?
123 | no: maybe primary gave different ops to different replicas
124 | if we execute before we're sure, we've wrecked the replica's state
125 | so we need a second round of messages to make sure all good replicas got the same op
126 |
127 | straw man 5:
128 | 3f+1 servers, one is primary, f faulty, primary might be faulty
129 | client sends request to primary AND to each replica
130 | primary sends PRE-PREPARE(op, n) to replicas
131 | each replica sends PREPARE(op, n) to all replicas
132 | if replica gets PREPARE(op, n) from 2f+1 replicas (incl itself)
133 | (with same op and n)
134 | execute the operation, possibly modifying state
135 | send reply to client
136 | client is happy when it gets f+1 matching replies
137 |
138 | REQ PRE-P PREPARE REPLY
139 | C
140 | 0
141 | 1
142 | 2
143 | 3
144 |
145 | does this cope with the primary sending different ops to different replicas?
146 | yes: replicas won't see 2f+1 matching PREPAREs
147 | also handles primary assigning different sequence #s
148 | result: system will stop
149 |
150 | does this cope with the primary lying to the client about the reply?
151 | yes: each replica sends a reply, not the primary
152 | result: system will stop
153 |
154 | does this cope with the primary not sending operations to replicas?
155 | yes: replicas receive request from the client
156 | but don't see a PRE-PREPARE, or don't see a PREPARE
157 | result: system will stop
158 |
159 | how to resume operation after faulty primary?
160 | need a view change to choose new primary
161 | view change does *not* choose a set of "live" servers
162 | 2f+1 of 3f+1 deals with faulty servers
163 |
164 | when to trigger a view change?
165 | if a replica sees a client op but doesn't see 2f+1 matching PREPAREs
166 |
167 | what if faulty replicas try to trigger a view change to shoot down a non-faulty primary?
168 | require view change requests from 2f+1 replicas
169 |
170 | who is the next primary?
171 | need to make sure faulty replicas can't always make themselves next primary
172 | view number v
173 | primary is v mod n
174 | so primary rotates among servers
175 | at most f faulty primaries in a row
176 |
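A small Go sketch of these rules (timeout-triggered view change, primary = v mod n,
and the 2f+1 gate); the function names and parameters are made up for illustration.

    package main

    import (
        "fmt"
        "time"
    )

    const (
        f = 1
        n = 3*f + 1
    )

    // nextPrimary rotates the primary so at most f faulty primaries can
    // appear in a row.
    func nextPrimary(view int) int { return view % n }

    // Replica-side trigger: an op was seen from the client but has not
    // completed within the timeout => ask for a view change.
    func wantsViewChange(opSeen time.Time, done bool, timeout time.Duration) bool {
        return !done && time.Since(opSeen) > timeout
    }

    // New-primary-side gate: the view change only happens once 2f+1
    // replicas ask, so f faulty replicas cannot evict a good primary.
    func enoughViewChangeRequests(requests map[int]bool) bool {
        return len(requests) >= 2*f+1
    }

    func main() {
        fmt.Println("primary of view 5:", nextPrimary(5))
        fmt.Println("want view change?",
            wantsViewChange(time.Now().Add(-2*time.Second), false, time.Second))
        fmt.Println("enough requests?",
            enoughViewChangeRequests(map[int]bool{0: true, 1: true, 3: true}))
    }
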
177 | view change straw man
178 | replicas send VIEW-CHANGE requests to *new* primary
179 | new primary waits for 2f+1 view-change requests
180 | new primary announces view change w/ NEW-VIEW
181 | includes the 2f+1 VIEW-CHANGE requests
182 | as proof that enough replicas wanted to change views
183 | new primary starts numbering operations at last n it saw + 1
184 |
185 | will all non-faulty replicas agree about operation numbering across view change?
186 |
187 | problem:
188 | I saw 2f+1 PREPAREs for operation n, so I executed it
189 | new primary did not, so it did not execute it
190 | maybe new primary didn't even see the PRE-PREPARE for operation n
191 | since old primary only waited for 2f+1 replies
192 | or old primary may never have sent PRE-PREPARE to next primary
193 | thus new primary may start numbering at n, yielding two different op #n
194 |
195 | can new primary ask all replicas for set of operations they have executed?
196 | doesn't work: new primary can only wait for 2f+1 replies
197 | faulty replicas may reply, so new primary may not wait for me
198 |
199 | solution:
200 | don't execute operation until sure a new primary will hear about it
201 | add a third phase: PRE-PREPARE, PREPARE, then COMMIT
202 | only execute after commit
203 |
204 | operation protocol:
205 | client sends op to primary
206 | primary sends PRE-PREPARE(op, n) to all
207 | all send PREPARE(op, n) to all
208 | after replica receives 2f+1 matching PREPARE(op, n)
209 | send COMMIT(op, n) to all
210 | after receiving 2f+1 matching COMMIT(op, n)
211 | execute op
212 |
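For this multicast variant, a compressed per-replica sketch (invented names; signatures,
views, and checkpoints omitted): send COMMIT once 2f+1 matching PREPAREs arrive, and
execute only once 2f+1 matching COMMITs arrive.

    package main

    import "fmt"

    const f = 1
    const quorum = 2*f + 1

    type entry struct {
        op       string
        prepares map[int]bool // replica IDs that sent matching PREPARE(op, n)
        commits  map[int]bool // replica IDs that sent matching COMMIT(op, n)
        executed bool
    }

    type Replica struct {
        log map[int]*entry // sequence number -> progress for that op
    }

    func (r *Replica) get(n int, op string) *entry {
        if r.log[n] == nil {
            r.log[n] = &entry{op: op, prepares: map[int]bool{}, commits: map[int]bool{}}
        }
        return r.log[n]
    }

    // onPrepare returns true when this replica should broadcast COMMIT(op, n).
    func (r *Replica) onPrepare(from, n int, op string) bool {
        e := r.get(n, op)
        e.prepares[from] = true
        return len(e.prepares) == quorum
    }

    // onCommit returns true when it is safe to execute op n: a new primary
    // is now guaranteed to learn about it during a view change.
    func (r *Replica) onCommit(from, n int, op string) bool {
        e := r.get(n, op)
        e.commits[from] = true
        if len(e.commits) >= quorum && !e.executed {
            e.executed = true
            return true
        }
        return false
    }

    func main() {
        r := &Replica{log: map[int]*entry{}}
        for id := 0; id < quorum; id++ {
            if r.onPrepare(id, 1, "put(x,1)") {
                fmt.Println("2f+1 PREPAREs -> send COMMIT(op, 1)")
            }
        }
        for id := 0; id < quorum; id++ {
            if r.onCommit(id, 1, "put(x,1)") {
                fmt.Println("2f+1 COMMITs -> execute op 1")
            }
        }
    }
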
213 | view change:
214 | each replica sends new primary 2f+1 PREPAREs for recent ops (if it has them)
215 | new primary waits for 2f+1 VIEW-CHANGE requests
216 | new primary sends NEW-VIEW msg to all replicas with
217 | complete set of VIEW-CHANGE msgs
218 | list of every op for which some VIEW-CHANGE contained 2f+1 PREPAREs
219 | i.e. list of final ops from last view
220 |
221 | informal argument why if a replica executes an op, new primary will know of that op
222 | replica only executed after receiving 2f+1 COMMITS
223 | maybe f of those were lies, from faulty replicas, who won't tell new primary
224 | but f+1 COMMITs were from replicas that got matching PREPAREs from 2f+1 replicas
225 | new primary waits for view-change requests from 2f+1 replicas
226 | at least 1 of those f+1 will report PREPARE set to the new primary
227 |
228 | can a replica trick new primary into believing a manufactured op?
229 | no -- replica must present 2f+1 PREPAREs for that op
230 | the PREPAREs must have the original signatures
231 | so the non-faulty nodes must have seen that op
232 |
233 | can the new primary omit some of the reported recent operations?
234 | replicas must check that the primary announced the right set
235 | compare the NEW-VIEW's op list against the PREPARE info in the VIEW-CHANGE msgs it includes
236 |
237 | paper also discusses
238 | checkpoints and logs to help good nodes recover
239 | various cryptographic optimizations
240 | optimizations to reduce # of msgs in common case
241 | fast read-only operations
242 |
243 | what are the consequences of more than f corrupt servers?
244 | can the system recover?
245 |
246 | suppose a single server has a fail-stop fault (e.g. powered off)
247 | can the system still survive f additional malicious faulty nodes?
248 | will it be live?
249 | can bad nodes trick it into executing incorrect operations?
250 |
251 | what if the client is corrupt?
252 |
253 | suppose we had a technique to provide Byzantine fault tolerance (BFT)
254 | would it be useful to apply it to a set of identical servers?
255 | how about non-identical servers run by same organization?
256 |
--------------------------------------------------------------------------------
/original-notes/pbft-2012.txt:
--------------------------------------------------------------------------------
1 | 6.824 2012 Lecture 17: Security: Byzantine Fault Tolerance
2 |
3 | reminder: start thinking about projects, groups of 2/3
4 |
5 | we've considered many fault-tolerance protocols
6 | have always assumed "fail-stop" failures -- like power failure
7 | i.e. servers follow the protocol
8 | hard enough: crash vs network down; network partition
9 |
10 | can one handle a larger class of failures?
11 | buggy servers, that compute incorrectly rather than stopping?
12 | servers that *don't* follow the protocol?
13 | servers that have been modified by an attacker?
14 | often called "Byzantine" faults
15 |
16 | the paper's approach:
17 | replicated state machine
18 | assumes 2f+1 of 3f+1 are non-faulty
19 | use voting to select the right results
20 | not as easy as it might sound
21 |
22 | let's assume the worst case:
23 | a single attacker controls the f faulty replicas
24 | and is actively trying to break the system
25 | if we can handle this, we can handle bugs in f replicas too
26 |
27 | what are the attacker's powers?
28 | supplies the code that faulty replicas run
29 | knows the code the non-faulty replicas are running
30 | knows the faulty replicas' crypto keys
31 | can read network messages
32 | can temporarily force messages to be delayed via DoS
33 |
34 | what faults *can't* happen?
35 | no more than f out of 3f+1 replicas can be faulty
36 | no client failure -- clients never do anything bad
37 | no guessing of crypto keys or breaking of cryptography
38 |
39 | example use scenario:
40 | RM:
41 | echo A > grade
42 | echo B > grade
43 | tell YM "the grade file is ready"
44 | YM:
45 | cat grade
46 |
47 | a faulty system could:
48 | totally make up the file contents
49 | execute write("A") but ignore write("B")
50 | show "B" to RM and "A" to YM
51 | execute write("B") on only some of the replicas
52 |
53 | let's try to design our own byzantine-fault-tolerant RSM
54 | start simple (and broken), work towards paper's design
55 |
56 | design 1:
57 | [client, n servers]
58 | n servers
59 | client sends request to all of them
60 | waits for all n to reply
61 | only proceeds if all n agree
62 |
63 | what's wrong with design 1?
64 | one faulty replica can stop progress by disagreeing
65 |
66 | design 2:
67 | let's have replicas vote
68 | 2f+1 servers, assume no more than f are faulty
69 | client waits for f+1 matching replies
70 | if only f are faulty, and network works eventually, must get them!
71 |
72 | what's wrong with design 2's 2f+1?
73 | f+1 matching replies might be f bad nodes and just 1 good
74 | so maybe only one good node got the operation!
75 | *next* operation also waits for f+1
76 | might *not* include that one good node that saw op1
77 | example:
78 | S1 S2 S3 (S1 is bad)
79 | everyone hears and replies to write("A")
80 | S1 and S2 reply to write("B"), but S3 misses it
81 | client can't wait for S3 since it may be the one faulty server
82 | S1 and S3 reply to read(), but S2 misses it
83 | so read() yields "A"
84 | result: client tricked into accepting a reply based on out-of-date state
85 | e.g. TA reads A instead of B from grades file
86 |
87 | design 3:
88 | 3f+1 servers, of which at most f are faulty
89 | client waits for 2f+1 matching replies
90 | == f bad nodes plus a majority of the good nodes
91 | so all sets of 2f+1 overlap in at least one good node
92 | example:
93 | S1 S2 S3 S4 (S1 is bad)
94 | everyone hears write("A")
95 | S1, S2, S3 process write("B"), S4 misses it
96 | now the read()
97 | client will wait for 2f+1=3 matching replies
98 | S1 and S4 will reply "A"
99 | S2 and S3 will reply "B"
100 | client doesn't know what to believe (neither is 2f+1)
101 | but it is guaranteed to see there's a problem
102 | so client can *detect* that some good nodes missed an operation
103 | we'll see how to repair in a bit
104 |
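A sketch of the client-side tally for design 3, using the S1..S4 example above; the
helper function is illustrative, not code from the paper.

    package main

    import "fmt"

    const f = 1

    // tally collects replies (replica -> value) and reports either the value
    // with 2f+1 matching replies, or that so far the client can only detect
    // that some good replica missed an operation.
    func tally(replies map[string]string) (value string, ok bool) {
        counts := map[string]int{}
        for _, v := range replies {
            counts[v]++
            if counts[v] >= 2*f+1 {
                return v, true
            }
        }
        return "", false
    }

    func main() {
        // The S1..S4 example: S1 is faulty, S4 missed write("B").
        replies := map[string]string{"S1": "A", "S4": "A", "S2": "B", "S3": "B"}
        if v, ok := tally(replies); ok {
            fmt.Println("accept", v)
        } else {
            fmt.Println("no value has 2f+1 matching replies; a good replica missed an op")
        }
    }
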
105 | what about handling multiple clients?
106 | non-faulty replicas must process operations in the same order!
107 |
108 | let's have a primary to pick order for concurrent client requests
109 | but we have to worry about a faulty primary
110 |
111 | what can a faulty primary do?
112 | 1. send wrong result to client
113 | 2. different ops to different replicas
114 | 3. ignore a client op
115 |
116 | general approach to handling faulty primary
117 | 1. replicas send results direct to client
118 | 2. replicas exchange info about ops sent by primary
119 | 3. clients notify replicas of each operation, as well as primary
120 | each replica watches progress of each operation
121 | if no progress, force change of primary
122 |
123 | can a replica execute an operation when it first receives it from primary?
124 | no: maybe primary gave different ops to different replicas
125 | if we execute before we're sure, we've wrecked the replica's state
126 | need 2nd round of messages to make sure all good replicas got the same op
127 |
128 | design 4:
129 | 3f+1 servers, one is primary, f faulty, primary might be faulty
130 | client sends request to primary AND to each replica
131 | primary chooses next op and op #
132 | primary sends PRE-PREPARE(op, n) to replicas
133 | each replica sends PREPARE(op, n) to all replicas
134 | if replica gets matching PREPARE(op, n) from 2f+1 replicas (incl itself)
135 | and n is the next operation #
136 | execute the operation, possibly modifying state
137 | send reply to client
138 | else:
139 | keep waiting
140 | client is happy when it gets f+1 matching replies
141 |
142 | REQ PRE-P PREPARE REPLY
143 | C
144 | 0
145 | 1
146 | 2
147 | 3
148 |
149 | remember our strategy:
150 | primary follows protocol => progress
151 | no progress => replicas detect and force change of primary
152 |
153 | if the primary is non-faulty, can faulty replicas prevent correct progress?
154 | they can't forge primary msgs
155 | they can delay msgs, but not forever
156 | they can do nothing: but they aren't needed for 2f+1 matching PREPAREs
157 | they can send correct PREPAREs
158 | and DoS f good replicas to prevent them from hearing ops
159 | but those replicas will eventually hear the ops from the primary
160 | worst outcome: delays
161 |
162 | if the primary is faulty, will replicas detect any problem?
163 | or can primary cause undetectable problem?
164 | primary can't forge client ops -- signed
165 | it can't ignore client ops -- client sends to all replicas
166 | it can try to send in different order to different replicas,
167 | or try to trick replicas into thinking an op has been
168 | processed even though it hasn't
169 | will replicas detect such an attack?
170 |
171 | results of the primary sending diff ops to diff replicas?
172 | case 1: all good nodes get 2f+1 matching PREPAREs
173 | did they all get the same op?
174 | yes: everyone who got 2f+1 matching PREPAREs must have gotten same op
175 | since any two sets of 2f+1 share at least one good server
176 | result: all good nodes will execute op2, client happy
177 | case 2: >= f+1 good nodes get 2f+1 matching PREPARES
178 | again, no disagreement possible
179 | result: f+1 good nodes will execute op, client happy
180 | BUT up to f good nodes don't execute
181 | can they be used to effectively roll back the op?
182 | i.e. send the write("B") to f+1, send read() to remaining f
183 | no: won't be able to find 2f+1 replicas with old state
184 | so not enough PREPAREs
185 | case 3: < f+1 good nodes get 2f+1 matching PREPAREs
186 | result: client never gets a reply
187 | result: system will stop, since f+1 stuck waiting for this op
188 |
189 | how to resume operation after faulty primary?
190 | need a view change to choose new primary
191 | (this view change only chooses primary; no notion of set of live servers)
192 |
193 | when does a replica ask for a view change?
194 | if it sees a client op but doesn't see 2f+1 matching PREPAREs
195 | after some timeout period
196 |
197 | is it OK to trigger a view change if just one replica asks?
198 | no: faulty replicas might cause constant view changes
199 |
200 | let's defer the question of how many replicas must ask for
201 | a view change
202 |
203 | who is the next primary?
204 | need to make sure faulty replicas can't always make themselves next primary
205 | view number v
206 | primary is v mod n
207 | so primary rotates among servers
208 | at most f faulty primaries in a row
209 |
210 | view change design 1 (not correct)
211 | replicas send VIEW-CHANGE requests to *new* primary
212 | new primary waits for enough view-change requests
213 | new primary announces view change w/ NEW-VIEW
214 | includes the VIEW-CHANGE requests
215 | as proof that enough replicas wanted to change views
216 | new primary starts numbering operations at last n it saw + 1
217 |
218 | will all non-faulty replicas agree about operation numbering across view change?
219 |
220 | problem:
221 | I saw 2f+1 PREPAREs for operation n, so I executed it
222 | new primary did not, so it did not execute it
223 | thus new primary may start numbering at n, yielding two different op #n
224 |
225 | can new primary ask all replicas for set of operations they have executed?
226 | doesn't work: new primary can only wait for 2f+1 replies
227 | faulty replicas may reply, so new primary may not wait for me
228 |
229 | solution:
230 | don't execute operation until sure a new primary will hear about it
231 | add a third phase: PRE-PREPARE, PREPARE, then COMMIT
232 | only execute after commit
233 |
234 | operation protocol:
235 | client sends op to primary
236 | primary sends PRE-PREPARE(op, n) to all
237 | all send PREPARE(op, n) to all
238 | after replica receives 2f+1 matching PREPARE(op, n)
239 | send COMMIT(op, n) to all
240 | after receiving 2f+1 matching COMMIT(op, n)
241 | execute op
242 |
243 | view change:
244 | each replica sends new primary 2f+1 PREPAREs for recent ops
245 | new primary waits for 2f+1 VIEW-CHANGE requests
246 | new primary sends NEW-VIEW msg to all replicas with
247 | complete set of VIEW-CHANGE msgs
248 | list of every op for which some VIEW-CHANGE contained 2f+1 PREPAREs
249 | i.e. list of final ops from last view
250 |
251 | if a replica executes an op, will the new primary know of that op?
252 | replica only executed after receiving 2f+1 COMMITS
253 | maybe f of those were lies, from faulty replicas, who won't tell new primary
254 | but f+1 COMMITs were from replicas that got 2f+1 matching PREPAREs
255 | new primary waits for view-change requests from 2f+1 replicas
256 | ignoring the f faulty nodes
257 | f+1 sent COMMITs, f+1 sent VIEW-CHANGE
258 | must overlap
259 |
260 | can the new primary omit some of the reported recent operations?
261 | no, NEW-VIEW must include signed VIEW-CHANGE messages
262 |
263 | paper also discusses
264 | checkpoints and logs to help good nodes recover
265 | various cryptographic optimizations
266 | optimizations to reduce # of msgs in common case
267 | fast read-only operations
268 |
269 | what are the consequences of more than f corrupt servers?
270 | can the system recover?
271 |
272 | what if the client is corrupt?
273 |
274 | suppose an attacker can corrupt one of the servers
275 | exploits a bug, or steals a password, or has physical access, &c
276 | why can't the attacker corrupt them all?
277 |
--------------------------------------------------------------------------------
/original-notes/pbft.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/original-notes/pbft.ppt
--------------------------------------------------------------------------------
/papers/.htaccess:
--------------------------------------------------------------------------------
1 | # Protect the htaccess file
2 |
3 | Order Allow,Deny
4 | Deny from all
5 |
6 |
7 | # Enable directory browsing
8 | Options All Indexes
9 |
--------------------------------------------------------------------------------
/papers/akamai.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/akamai.pdf
--------------------------------------------------------------------------------
/papers/argus88.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/argus88.pdf
--------------------------------------------------------------------------------
/papers/bayou-conflicts.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/bayou-conflicts.pdf
--------------------------------------------------------------------------------
/papers/bitcoin.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/bitcoin.pdf
--------------------------------------------------------------------------------
/papers/bliskov-harp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/bliskov-harp.pdf
--------------------------------------------------------------------------------
/papers/cooper-pnuts.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/cooper-pnuts.pdf
--------------------------------------------------------------------------------
/papers/dht-9-per-page.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/dht-9-per-page.pdf
--------------------------------------------------------------------------------
/papers/dht.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/dht.pdf
--------------------------------------------------------------------------------
/papers/dynamo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/dynamo.pdf
--------------------------------------------------------------------------------
/papers/fds.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/fds.pdf
--------------------------------------------------------------------------------
/papers/ficus.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/ficus.pdf
--------------------------------------------------------------------------------
/papers/flp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/flp.pdf
--------------------------------------------------------------------------------
/papers/guardians-and-actions-liskov.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/guardians-and-actions-liskov.pdf
--------------------------------------------------------------------------------
/papers/kademlia.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/kademlia.pdf
--------------------------------------------------------------------------------
/papers/katabi-analogicfs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/katabi-analogicfs.pdf
--------------------------------------------------------------------------------
/papers/keleher-treadmarks.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/keleher-treadmarks.pdf
--------------------------------------------------------------------------------
/papers/li-dsm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/li-dsm.pdf
--------------------------------------------------------------------------------
/papers/mapreduce.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/mapreduce.pdf
--------------------------------------------------------------------------------
/papers/memcache-fb.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/memcache-fb.pdf
--------------------------------------------------------------------------------
/papers/paxos-simple.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/paxos-simple.pdf
--------------------------------------------------------------------------------
/papers/raft-atc14.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/raft-atc14.pdf
--------------------------------------------------------------------------------
/papers/remus.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/remus.pdf
--------------------------------------------------------------------------------
/papers/spanner.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/spanner.pdf
--------------------------------------------------------------------------------
/papers/zaharia-spark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alinush/6.824-lecture-notes/d836c8f5b76b1b1ca9e6e2be1e9a2057a3160d67/papers/zaharia-spark.pdf
--------------------------------------------------------------------------------
/paxos-algorithm.html:
--------------------------------------------------------------------------------
1 | It's magic!
2 |
3 |
4 |
5 | --- Paxos Proposer ---
6 |
7 | 1 proposer(v):
8 | 2 choose n, unique and higher than any n seen so far
9 | 3 send prepare(n) to all servers including self
10 | 4 if prepare_ok(n, n_a, v_a) from majority:
11 | 5 v' = v_a with highest n_a; choose own v otherwise
12 | 6 send accept(n, v') to all
13 | 7 if accept_ok(n) from majority:
14 | 8 send decided(v') to all
15 |
16 | --- Paxos Acceptor ---
17 |
18 | acceptor state:
19 | must persist across reboots
20 | n_p (highest prepare seen)
21 | n_a, v_a (highest accept seen)
22 |
23 | acceptor's prepare handler:
24 |
25 | 10 prepare(n):
26 | 11 if n > n_p
27 | 12 n_p = n
28 | 13 reply prepare_ok(n, n_a, v_a)
29 | 14 else
30 | 15 reply prepare_reject
31 |
32 | acceptor's accept(n, v) handler:
33 |
34 | 16 accept(n, v):
35 | 17 if n >= n_p
36 | 18 n_p = n
37 | 19 n_a = n
38 | 20 v_a = v
39 | 21 reply accept_ok(n)
40 | 22 else
41 | 23 reply accept_reject
42 |
43 |
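A minimal Go sketch of just the acceptor side of the pseudocode above (the proposer
loop, networking, and stable storage are omitted; type and method names are
illustrative, not from the course code).

    package main

    import "fmt"

    // Acceptor state from the pseudocode above; in a real deployment n_p,
    // n_a, and v_a must be written to stable storage before replying.
    type Acceptor struct {
        np int    // n_p: highest prepare seen
        na int    // n_a: proposal number of the highest accept seen
        va string // v_a: value of the highest accept seen
    }

    // Prepare follows lines 10-15: promise to ignore proposals lower than n.
    func (a *Acceptor) Prepare(n int) (ok bool, na int, va string) {
        if n > a.np {
            a.np = n
            return true, a.na, a.va // prepare_ok(n, n_a, v_a)
        }
        return false, 0, "" // prepare_reject
    }

    // Accept follows lines 16-23.
    func (a *Acceptor) Accept(n int, v string) bool {
        if n >= a.np {
            a.np, a.na, a.va = n, n, v
            return true // accept_ok(n)
        }
        return false // accept_reject
    }

    func main() {
        a := &Acceptor{}
        ok, na, va := a.Prepare(1)
        fmt.Println("prepare(1):", ok, na, va)
        fmt.Println("accept(1, x=1):", a.Accept(1, "x=1"))
        ok2, _, _ := a.Prepare(1) // rejected: 1 is not greater than n_p
        fmt.Println("second prepare(1):", ok2)
    }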