├── .travis.yml
├── LICENSE
├── README.md
├── centralized.js
├── demos.js
├── grid.js
├── hops.js
├── images
│   ├── redundant.svg
│   ├── spanning-with-redundant.svg
│   ├── spanning-without-redundant.svg
│   └── test.svg
├── index.html
├── measurements
│   └── overlap.js
├── message-passing.jpg
├── message-passing2.jpg
├── package.json
├── paper.md
├── random.js
├── references.md
├── repair-lazy.jpg
└── simulation
    ├── connected.js
    ├── flood.js
    ├── friends.js
    └── spanning.js
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - 0.6
4 | - 0.8
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 'Dominic Tarr'
2 |
3 | Permission is hereby granted, free of charge,
4 | to any person obtaining a copy of this software and
5 | associated documentation files (the "Software"), to
6 | deal in the Software without restriction, including
7 | without limitation the rights to use, copy, modify,
8 | merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom
10 | the Software is furnished to do so,
11 | subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
20 | ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scalable Secure Scuttlebutt
2 |
3 | * @dominictarr
4 |
5 | * scuttlebutt.nz
6 |
7 | ---
8 |
9 | # Scuttlebutt
10 |
11 | "anti-entropy" gossip protocol, part of amazon dynamo
12 |
13 | * "flow control for anti-entropy protocols"
14 | * eventually consistent
15 | * used within a trusted system
16 |
17 | ---
18 |
19 | # Secure Scuttlebutt
20 |
21 | * peer id is public key
22 | * data model signed hash chains
23 | * peers can relay others' messages
24 | * cannot insert, modify, or reorder messages
25 | * large objects in optional "attachments"
26 |
27 | ---
28 |
29 | * subscribe ("follow") model solves sybil attacks
30 | * provides social discovery
31 | * messages arrive in natural order - easy to build database on top of
32 |
33 | ---
34 |
35 | # Scalable Secure Scuttlebutt
36 |
37 | * optimized for bandwidth and latency
38 | * overhead proportional to feeds updated
39 |
40 | ---
41 |
42 | # part 1: network topology
43 |
44 | ---
45 |
46 | # Star Network
47 |
48 | naive design: connect to everyone
49 |
50 | ---
51 |
52 | ``` js_exe
53 | return require('./demos').centralized()
54 | ```
55 |
56 | ---
57 |
58 | # Random Network
59 |
60 | * connect to N > 2 peers ~= fully connected
61 | * tradeoff between bandwidth & latency
62 |
63 | ---
64 |
65 | ``` js_exe
66 | return require('./demos').random()
67 | ```
68 |
69 | ---
70 |
71 | # Rules for gossip flooding
72 |
73 | send _new_ messages to everyone who didn't send them to you
74 |
75 | (don't resend messages you already know!)
76 |
77 | ---
78 |
79 | # message passing diagram
80 |
81 | 
82 |
83 | ---
84 |
85 | # bandwidth complexity
86 |
87 | ```
88 | O(messages*(spanning+2*redundant))
89 | ```
90 | ---
91 |
92 | # how many redundant connections
93 |
94 | * 1: 0%
95 | * 2: 66%
96 | * 3: 80%
97 | * 4: 86%
98 | * 5: 89%
99 | * 10: 95%
100 |
101 | at 5 connections per peer, we use 9 times as much bandwidth
102 | ---
103 |
104 | # Spanning Tree Network
105 |
106 | remove duplicate connections
107 |
108 | * when you receive an old message, disable that connection
109 | * bandwidth complexity now approaches O(messages)
110 |
111 | ---
112 |
113 | ``` js_exe
114 | return require('./demos').spanning()
115 | ```
116 |
117 | ---
118 |
119 | # trees are fragile
120 |
121 | ``` js_exe
122 | return require('./demos').fragile()
123 | ```
124 |
125 |
126 | ---
127 |
128 | # spanning tree
129 |
130 | * network partitions are really easy
131 | * we just added many points of failure!
132 |
133 | ---
134 |
135 | # Epidemic Broadcast Trees
136 |
137 | * (from paper with the same name)
138 |
139 | * best of flooding and spanning
140 | * we run a separate tree per feed
141 | ---
142 |
143 | # Eager and Lazy (aka push and pull)
144 |
145 | * connections have two modes: **eager** and **lazy**
146 | * switch redundant connections into **lazy**
147 | * **eager** mode pushes message immediately
148 | * **lazy** mode sends short _note_ that message exists
149 |
150 | ---
151 |
152 | * if you receive a known message, ask them to be **lazy**
153 | * (if in **lazy** mode, just send a short note)
154 | * if you receive a note for a new message,
155 | switch connection back to **eager**
156 |
157 | ---
158 |
159 | 
160 |
161 | ---
162 |
163 | # state per feed
164 |
165 | * feed id (string)
166 | * local sequence (integer)
167 | * remote sequence (integer)
168 | * local request (integer)
169 | * remote request (integer)
170 | * local mode (boolean)
171 | * remote mode (boolean)
172 |
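a rough sketch of this state as an object (field names illustrative):

``` js
// per-connection, per-feed replication state (illustrative only)
var state = {
  id: '@alice',      // feed id
  localSeq: 3,       // latest sequence we have
  remoteSeq: 2,      // latest sequence they told us they have
  localReq: 3,       // sequence we requested from them
  remoteReq: 3,      // sequence they requested from us
  localMode: true,   // we are eager (true) or lazy (false)
  remoteMode: false  // they are eager or lazy
}
```
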
173 | ---
174 |
175 | # part 2: handshakes
176 |
177 | ---
178 |
179 | # basic vector clock
180 |
181 | (as described in flow gossip paper)
182 |
183 | ```
184 | Alice: {A: 1, B: 0, C: 3}
185 |
186 | Bob: {A: 0, B: 1, C: 3}
187 | ```
188 | alice sends A1, Bob sends B1
189 | handshake size is O(feeds)
190 |
191 | ---
192 |
193 | # skipping vector clock
194 |
195 | Alice stores Bob's last vector clock
196 |
197 | ```
198 | Alice (local): {A: 2, B: 2, C: 3}
199 |
200 | Bob (stored): {A: 0, B: 1, C: 3}
201 | ```
202 | drops elements which haven't changed
203 | (Carol hasn't updated)
204 |
205 | ---
206 |
207 | > Note: Alice stores Bob's A:0
208 | even though she sent A1.
209 | unless Bob acknowledges A1,
210 | Alice doesn't know for sure he has it.
211 |
212 | ---
213 |
214 | # so Alice sends:
215 |
216 | ```
217 | Alice: {A: 2, B: 2}
218 |
219 | Bob: {A: 1, B: 3}
220 | ```
221 | Alice sends A2, Bob sends B3
222 |
223 | ---
224 | # if Bob then gossips with Carol
225 |
226 | ```
227 | Alice: {}
228 |
229 | Bob: {C: 4}
230 | ```
231 |
232 | Alice sends an empty clock,
233 | because she thinks nothing has
234 | changed
235 | ---
236 | # but Bob has an update from Carol
237 |
238 | ``` js
239 | Alice: {C: 3}
240 | ```
241 | and Bob sends C4
242 |
243 | so, it costs one more message pass,
244 | but usually saves lots of requests.
245 |
246 | > note, we also save tracking this in memory
247 |
248 | ---
249 |
250 | # distribution of updates
251 |
252 | * power law is typical
253 | * a few active users
254 | * some moderate users
255 | * many occasional / infrequent users
256 |
257 | ---
258 |
259 | * basic scuttlebutt
260 |
261 | O(messages + feeds)
262 |
263 | * scalable scuttlebutt
264 |
265 | O(messages + updated_feeds)
266 |
267 | (updated_feeds is probably much smaller than feeds)
268 |
269 | reconnections are very cheap!
270 |
271 | ---
272 |
273 |
274 | # complexity overview
275 |
276 | bandwidth: O(messages + updated_feeds)
277 |
278 | latency: replication in 1 or 2 round trips
279 |
280 | ---
281 |
282 | # Part 3: Comparison to IPFS
283 |
284 | ---
285 |
286 | * replicate data without global consistency
287 | * secure scuttlebutt optimizes for chains
288 | * this is worst case for ipfs
289 |
290 | ---
291 |
292 | * ssb streams updates in order, per feed
293 | * ipfs has pointer to latest message
294 | then works backwards, one message at a time
295 |
296 | ---
297 |
298 | * ipfs better for arbitrary partial datasets
299 | * ssb better for social applications
300 | * ssb: database vs ipfs: file system
301 |
302 | ---
303 |
304 | # Thank you!
305 |
306 | * learn more: https://scuttlebutt.nz
307 | * frontend: https://github.com/ssbc/patchwork
308 | * backend: https://github.com/ssbc/scuttlebot
309 |
310 | ---
311 |
312 |
313 |
314 |
315 |
316 |
--------------------------------------------------------------------------------
/centralized.js:
--------------------------------------------------------------------------------
1 | var el = require('./demos').centralized()
2 | document.body.appendChild(el)
3 |
4 | el.dispatchEvent(new FocusEvent('focus'))
5 |
--------------------------------------------------------------------------------
/demos.js:
--------------------------------------------------------------------------------
1 | var h = require('hyperscript')
2 | var random = require('network-animator/random')
3 | var animate = require('network-animator/animate')
4 | var G = require('graphreduce')
7 |
8 | var W = 600, H = 600
9 |
10 | function delay (fn) {
11 | setTimeout(fn, 1000)
12 | }
13 |
14 | function canvas () {
15 | return h('canvas', {width: W, height: H})
16 | }
17 |
18 | function starter (fn) {
19 | return function () {
20 | // returning here runs every demo immediately; the focus/blur logic below is unreachable
21 | return fn(()=>false)
22 | }
23 | var stopped = false
24 | function stop () {
25 | return stopped
26 | }
27 | return function () {
28 | var d = h('div')
29 | d.addEventListener('focus', function () {
30 | stopped = false
31 | d.innerHTML = ''
32 | d.appendChild(fn(stop))
33 | })
34 | d.addEventListener('blur', function () {
35 | stopped = true
36 | })
37 | return d
38 | }
39 | }
40 |
41 | function circular (n, center, radius) {
42 | var a = {}, theta = Math.PI*2/n
43 | for(var i = 0; i < n; i++)
44 | a[i] = {x: center.x + Math.cos(i*theta)*radius, y: center.y + Math.sin(i*theta)*radius}
45 | return a
46 | }
47 |
48 | function central (N, center, radius) {
49 | var loc = circular(N-1, center, radius)
50 |
51 | for(var i = N - 1; i > 0; i--)
52 | loc[i] = loc[i-1]
53 |
54 | loc[0] = center
55 | return loc
56 | }
57 |
58 | exports.centralized = starter(function (stop) {
59 | var c = canvas() //
60 | var g = {}
61 | var N = 200
62 | for(var i = 0; i < N; i++) {
63 | G.addEdge(g, 0, i)
64 | G.addEdge(g, i, 0)
65 | }
66 |
67 | // var loc = random.locations(N, c.width, c.height)
68 | var loc = central(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9)
69 | var label = h('label')
70 |
71 | animate(g, loc, c.getContext('2d'), function next (_, packets) {
72 | var e = 0
73 | for(var i in packets)
74 | if(packets[i].extra) {
75 | e ++
76 | delete g[packets[i].from][packets[i].to]
77 | delete g[packets[i].to][packets[i].from]
78 | }
79 | label.textContent = e + '/' + packets.length
80 | delay(function () { animate(g, loc, c.getContext('2d'), next) })
81 | })
82 | return h('div', c, label)
83 | })
84 |
85 | exports.random = starter(function (stop) {
86 |
87 | var c = canvas() //h('canvas', {width: W, height: H})
88 | var N = 200
89 | var g = random.graph(N, 3)
90 | var loc = random.locations(200, c.width, c.height)
91 | // var loc = circular(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9)
92 | var label = h('label')
93 |
94 | animate(g, loc, c.getContext('2d'), function next (_, packets) {
95 | var e = 0
96 | for(var i in packets)
97 | if(packets[i].extra) e ++
98 | label.textContent = e + '/' + packets.length
99 | delay(function () { animate(g, loc, c.getContext('2d'), next) })
100 | })
101 | return h('div', c, label)
102 | })
103 |
104 | exports.grid = starter(function (stop) {
105 |
106 | var c = canvas() //h('canvas', {width: W, height: H})
107 | var N = 200
108 | var g = {}
109 | var loc = {}
110 | for(var i = 0; i < N; i++) {
111 | if(i%10)
112 | G.addEdge(g, i, (N+i-1)%N)
113 | if((i%10) < 9)
114 | G.addEdge(g, i, (N+i+1)%N)
115 | if(i > 10)
116 | G.addEdge(g, i, (N+i-10)%N)
117 | if(i < N - 10)
118 | G.addEdge(g, i, (i+10)%N)
119 |
120 | loc[i] = {x: (i%10)*c.width/10, y: Math.floor(i/10)*c.height/20}
121 | }
122 |
123 |
124 | // var g = random.graph(N, 3)
125 | // var loc = random.locations(200, c.width, c.height)
126 | // var loc = circular(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9)
127 | var label = h('label')
128 |
129 | animate(g, loc, c.getContext('2d'), function next (_, packets) {
130 | var e = 0
131 | for(var i in packets)
132 | if(packets[i].extra) e ++
133 | label.textContent = e + '/' + packets.length
134 | delay(function () { animate(g, loc, c.getContext('2d'), next) })
135 | })
136 | return h('div', c, label)
137 |
138 | })
139 |
140 | exports.spanning = starter(function () {
141 | var c = canvas() //h('canvas', {width: W, height: H})
142 | var g = random.graph(200, 3)
143 | var loc = random.locations(200, c.width, c.height)
144 | var label = h('label')
145 |
146 | animate(g, loc, c.getContext('2d'), function next (_, packets) {
147 | var e = 0
148 | for(var i in packets)
149 | if(packets[i].extra) {
150 | e ++
151 | delete g[packets[i].from][packets[i].to]
152 | delete g[packets[i].to][packets[i].from]
153 | }
154 | label.textContent = e + '/' + packets.length
155 | delay(function () { animate(g, loc, c.getContext('2d'), next) })
156 | })
157 | return h('div', c, label)
158 |
159 | })
160 | exports.fragile = starter(function (stop) {
161 | var N = 200
162 | var c = canvas()
163 | var loc = random.locations(N, c.width, c.height)
164 | var label = h('label')
165 |
166 | //;(function next () {
167 | var g = random.graph(N, 2)
168 |
169 | animate(g, loc, c.getContext('2d'), function next (_, packets) {
170 | var e = 0
171 | for(var i in packets)
172 | if(packets[i].extra) {
173 | e ++
174 | delete g[packets[i].from][packets[i].to]
175 | delete g[packets[i].to][packets[i].from]
176 | }
177 |
178 | //randomly delete a node; remove its incoming edges before the node itself
179 | var d
180 | for(var j = 0; j < 1; j++) {
181 | d = ~~(Math.random()*N)
182 | for(var k in g[d])
183 | delete g[k][d]
184 | delete g[d]
185 | }
186 |
187 | label.textContent = e + '/' + packets.length
188 | if(!stop()) animate(g, loc, c.getContext('2d'), next)
189 | // delay(function () { animate(g, loc, c.getContext('2d'), next) })
190 | })
191 |
192 | //})()
193 | return h('div', c, label)
194 |
195 | })
196 |
197 | var cmd = process.argv[2]
198 |
199 |
200 | document.body.appendChild(h('div',
201 | Object.keys(exports).map(name => {
202 | return h('div',
203 | h('h1', name),
204 | exports[name](),
205 | h('hr')
206 | )
207 | })
208 | ))
209 | /*
210 | if(module.main && exports[cmd])
211 | document.body.appendChild(exports[cmd]())
212 | else {
213 | console.log('expected command:'+Object.keys(exports).join(', ') + ' but got:'+cmd)
214 | // window.close()
215 | //process.exit(1)
216 | }
217 |
218 | */
--------------------------------------------------------------------------------
/grid.js:
--------------------------------------------------------------------------------
1 | var el = require('./demos').grid()
2 | document.body.appendChild(el)
3 |
4 | el.dispatchEvent(new FocusEvent('focus'))
5 |
--------------------------------------------------------------------------------
/hops.js:
--------------------------------------------------------------------------------
1 |
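// Simulates flooding a single message through a random network with K
// outgoing connections per peer, and prints the markdown table used in
// paper.md ("random connected network" section).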
2 | var G = require('graphreduce')
3 |
4 | function random (N, K) {
5 | var g = {}
6 | for(var i = 0; i < N; i++) {
7 | for(var j = 0; j < K; j++) {
8 | var a = ~~(Math.random()*i)
9 | g = G.addEdge(g, a, i)
10 | g = G.addEdge(g, i, a)
11 | }
12 | }
13 | return g
14 | }
15 |
16 | function round(n, u) {
17 | u = Math.pow(10, u)
18 | return Math.round(n*u)/u
19 | }
20 |
21 | function hops (g, start, seen) {
22 | var front = {}, total, tcount = 0, redun = 0
23 |
24 | function visit (hops) {
25 | var count = 0
26 | for(var k in front) {
27 | for(var j in g[k]) {
28 | if(seen[j] == null) {
29 | front[j] = true
30 | seen[j] = hops
31 | total += hops
32 | tcount ++
33 | count ++
34 | }
35 | else
36 | redun ++
37 | }
38 | delete front[k]
39 | }
40 | return count
41 | }
42 | front[start] = true
43 | var hops = 0, total = 0
44 | while(visit(++hops));
45 |
46 | return [tcount, hops, round(total/tcount, 3), redun, round((redun/(tcount-1)), 3)]
47 | }
48 |
49 | //for(var i = 0; i < 100; i++) {
50 | console.log('|K|peers|hops|avg|msgs|inefficiency|')
51 | for(var i = 1; i <= 20; i++) {
52 | var seen = {}
53 | var g = random(1000, i)
54 | console.log('|'+[i].concat(hops(g, 0, seen)).join('|')+'|')
55 | }
56 | //}
57 |
58 |
--------------------------------------------------------------------------------
/images/redundant.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/images/test.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/measurements/overlap.js:
--------------------------------------------------------------------------------
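// Measures, using a real ssb contact graph, how much of a peer's two-hop
// replication set is covered by the two-hop sets of its direct follows
// (see paper.md, "overlapping replication sets").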
1 | var path = require('path')
2 |
3 | var d = require('dynamic-dijkstra')(require('dynamic-dijkstra/simple'))
4 |
5 | var g = require(path.join(process.env.HOME, '.ssb', 'flume', 'contacts2.json')).value
6 |
7 | var HOPS = 2
8 |
9 | function first (o) {
10 | for(var k in o) return k
11 | }
12 |
13 | function compare (a, b) {
14 |
15 | }
16 |
17 | function diff (a, b) {
18 | var hits = 0, misses = 0, misses2 = 0, total = 0
19 | for(var k in a)
20 | if(a[k] > 0) {
21 | total ++
22 | if(b[k] != null && b[k] > 0) {
23 | hits++
24 | }
25 | else {
26 | misses ++
27 | }
28 | }
29 | for(var k in b)
30 | if(b[k] > 0) {
31 | if(!(a[k] != null && a[k] > 0)) {
32 | total ++
33 | misses2 ++
34 | }
35 | }
36 |
37 | return {
38 | hits: hits, misses: misses, misses2: misses2, total: total,
39 | avg: hits/total
40 | }
41 | }
42 |
43 | var hops = d.traverse(g, null, HOPS, first(g))
44 | var peers = {}
45 | for(var k in hops) {
46 | if(hops[k] > 0 && hops[k] == 1) {
47 | var _hops = d.traverse(g, null, HOPS, k)
48 | var data = diff(hops, _hops)
49 | data.hops = [0,0,0]
50 | for(var j in hops) //use j so the console.log below still logs the outer peer k
51 | if(hops[j] > 0 && _hops[j] > 0) {
52 | data.hops[_hops[j]] = (data.hops[_hops[j]] || 0) + 1
53 | peers[j] = (peers[j] || 0) + 1
54 | }
55 |
56 | console.log(k, data)
57 |
58 | }
59 | }
60 | var dist = {}
61 | for(var k in hops)
62 | if(hops[k] > 0)
63 | dist[k] = { hops: hops[k], peers: peers[k]}
64 | //console.log(dist)
65 |
66 | for(var i = 0; i < 100; i++) {
67 |
68 | covered = {}
69 | var oneHop = Object.keys(hops).filter(function (k) {
70 | return hops[k] === 1
71 | }).sort(function () { return Math.random () - 0.5})
72 | .slice(0, 5).forEach(function (id) {
73 | var _hops = d.traverse(g, null, HOPS, id)
74 | for(var k in hops) {
75 | covered[k] = covered[k] || 0
76 | if(hops[k] > 0 && _hops[k] > 0)
77 | covered[k] ++
78 | }
79 | })
80 |
81 | function sum (a, b) { return (a || 0) + b }
82 | var times = [0,0,0,0,0,0], times2 = [0,0,0,0,0]
83 | for(var k in covered) {
84 | times2[covered[k]] = (times2[covered[k]] || 0) + 1
85 | if(hops[k] === 1)
86 | times[covered[k]] = (times[covered[k]] || 0) + 1
87 | }
88 | console.log(times)
89 | console.log(times2)
90 | var total = Object.keys(hops).length
91 | for(var k in times2) {
92 | var cuml = times2.slice(k).reduce(sum)
93 | console.log(k, times2[k], times2[k]/total, cuml, cuml/total)
94 | }
95 | }
96 |
97 |
98 |
--------------------------------------------------------------------------------
/message-passing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/message-passing.jpg
--------------------------------------------------------------------------------
/message-passing2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/message-passing2.jpg
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "scalable-secure-scuttlebutt",
3 | "description": "",
4 | "version": "1.0.0",
5 | "homepage": "https://github.com/dominictarr/scalable-secure-scuttlebutt",
6 | "repository": {
7 | "type": "git",
8 | "url": "git://github.com/dominictarr/scalable-secure-scuttlebutt.git"
9 | },
10 | "dependencies": {
11 | "statistics": "^3.3.0"
12 | },
13 | "devDependencies": {
14 | "graphreduce": "^3.0.4",
15 | "hyperscript": "^2.0.2",
16 | "marked": "^4.0.16",
17 | "network-animator": "^1.0.0"
18 | },
19 | "scripts": {
20 | "test": "set -e; for t in test/*.js; do node $t; done"
21 | },
22 | "author": "'Dominic Tarr' (dominictarr.com)",
23 | "license": "MIT"
24 | }
25 |
--------------------------------------------------------------------------------
/paper.md:
--------------------------------------------------------------------------------
1 |
2 | # scalable secure scuttlebutt
3 |
4 | These are notes for a paper about ssb.
5 |
6 | ## assumptions about social networks
7 |
8 | TODO: find references to justify these assumptions.
9 |
10 | * _power law assumption_: we expect activity to follow a power law
11 | (a small proportion of users update very frequently,
12 | while a large number update only infrequently)
13 |
14 | * _tendency for local connections_: if you are "friends" with a peer,
15 | it's likely that another of your friends is also their friend.
16 |
17 | * _hub connections assumption_: short paths to hubs: some users are "hubs", connected
18 | to a large number of other peers. Paths to even distant users
19 | are short due to connections via hubs. It's highly likely that
20 | someone you know follows any given celebrity.
21 |
22 | For simplicity, we model the following replication protocol designs in the context of connected random swarms.
23 | In practice, we do not want a design that replicates all messages in the entire network (especially because
24 | we intend it to scale to millions of users). For social media applications, users view the feeds of peers they
25 | have explicitly followed/friended. However, due to the strongly connected nature of the social graph
26 | (the primary way people meet is being introduced to friends of friends) the chance of having a considerable
27 | overlap in your follow graph with any friend is quite high. Thus, the simpler-to-model random network is a reasonable
28 | approximation to friend replication in a real social network - how good this approximation is is discussed in
29 | the **TODO: write this** section.
30 |
31 | ## data models
32 |
33 | We use a simple data model that fits most social media applications.
34 | The main resource is a _feed_, which is an append-only log of _messages_,
35 | without branches. Each peer may publish zero or more feeds, and subscribe
36 | to zero or more feeds.
37 |
38 | Each message contains the id of the feed, an always increasing sequence number,
39 | and some content. (Also, the hash of the previous message and a signature, but this paper focuses
40 | on the performance of our design, not the security, so we can leave this out of discussion for now)
41 |
42 | ```
43 | Feed f = {id: f.id, log: [{ id: f.id, sequence, content }, ...]}
44 | ```
45 |
46 | In this paper we can assume that each peer is the author of one feed and that the peer's id
47 | is also the id of that feed. The peer also stores the log of all the peers that it subscribes to.
48 |
49 | ```
50 | Peer p = { id: p.id, feeds: { <f.id>: f, ... } }
51 | ```
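
To make the data model concrete, here is a minimal JavaScript sketch of these structures
(illustrative only; as noted above, real messages also carry the previous hash and a signature,
which we ignore here):

``` js
// minimal sketch of the data model
function createFeed (id) {
  return { id: id, log: [] }
}

function append (feed, content) {
  // sequence numbers start at 1 and always increase by one
  var msg = { id: feed.id, sequence: feed.log.length + 1, content: content }
  feed.log.push(msg)
  return msg
}

function createPeer (id) {
  var feeds = {}
  feeds[id] = createFeed(id) // a peer authors one feed, named by its own id
  return { id: id, feeds: feeds }
}
```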
52 |
53 | ## comparison of replication algorithms
54 |
55 | Starting with the simplest, we develop models of data replication.
56 | >I basically just made up the O() notations... maybe this should be based on simulations instead?
57 | especially since some of my arguments depend on a certain factor
58 | being limited somewhat (by the nature of networks).
59 |
60 | ### polled scan: RSS (Really Simple Syndication)
61 |
62 | A publisher (`pub`, of type `Peer`) hosts content, and a subscriber (`sub`, also of type `Peer`)
63 | connects, and the publisher sends its content.
64 |
65 | At each request, the publisher sends the entire feed.
66 |
67 | > (footnote: In practice, RSS truncates the feed and may not send older messages,
68 | so it isn't provably eventually consistent; we've analyzed a simplified version,
69 | which has provable eventual consistency.)
70 |
71 | This is extremely simple to implement at the server end (RSS provides an XML file over HTTP)
72 | and slightly more complex at the client end, as clients append only the new values.
73 | It's assumed that messages are fairly small, text only, and large files are referenced as some sort of link.
74 |
75 | When a subscriber connects, the publisher replies with `received = pub.feeds[pub.id]`
76 | (which means sending `pub.feeds[pub.id].length` messages).
77 | The subscriber then appends any new messages to their copy of that feed,
78 | `sub.feeds[pub.id].append(received[sub.feeds[pub.id].length...])`, such that both copies of the feed are the same,
79 | that is, contain copies of the same messages: `sub.feeds[pub.id] == pub.feeds[pub.id]`.
80 |
81 | New messages are published over time, and so the subscriber periodically makes a request to each publisher.
82 |
83 | ```
84 | interval(sub.pollFrequency, () => sub.feeds.each(id => sub.connect(id)) )
85 | ```
86 |
87 | So, every `sub.pollFrequency` all publishers are connected to and all messages from them are downloaded,
88 | old messages end up being sent many times unnecessarily, so the amount of bandwidth needed scales very badly.
89 |
90 | Bandwidth needed by a subscriber can be calculated as follows:
91 |
92 | > (footnote: Assume that `pollFrequency` is the number of polls
93 | within the given timeframe that we are calculating resource usage for. The important thing is how many polls are made.
94 | If we were to calculate usage per day, and there was one poll per day, pollFrequency is 1. In any case, we are
95 | more interested in exploring the relationship between the various design factors and resources used, so the important
96 | thing to observe here is that the total resource used is _multiplied_ by `pollFrequency`, doubling `pollFrequency` doubles
97 | the number of messages sent)
98 |
99 | ```
100 | total_messages = sum(map(sub.feeds, id => sub.feeds[id].length))
101 | sub.pollFrequency * total_messages
102 | ```
103 | Each interval, the subscriber polls every publisher, and receives all messages.
104 | Hence the total set of messages is redownloaded every interval.
105 |
106 | Bandwidth needed by the publisher can be calculated as follows:
107 |
108 | ```
109 | subscribers = sum(peers, peer => peer.feeds[pub.id] ? 1 : 0)
110 | avg_poll_frequency = sum(peers, peer => peer.feeds[pub.id] ? peer.pollFrequency : 0) / subscribers
111 | subscribers * avg_poll_frequency * pub.feeds[pub.id].length
112 | ```
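
As a rough illustration, these costs can be computed directly from the model
(a sketch reusing the structures from the data model sketch above; the function
names are mine, not part of any protocol):

``` js
// bandwidth cost of polled scan over one accounting period (illustrative)
function subscriberCost (sub) {
  var totalMessages = Object.keys(sub.feeds)
    .reduce(function (sum, id) { return sum + sub.feeds[id].log.length }, 0)
  // every poll re-downloads every message of every subscribed feed
  return sub.pollFrequency * totalMessages
}

function publisherCost (pub, peers) {
  var subs = peers.filter(function (peer) { return peer.feeds[pub.id] })
  var avgPollFrequency = subs
    .reduce(function (sum, peer) { return sum + peer.pollFrequency }, 0) / subs.length
  // the whole feed is re-sent to every subscriber, on every poll
  return subs.length * avgPollFrequency * pub.feeds[pub.id].log.length
}
```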
113 |
114 | Clients have a tradeoff between bandwidth and latency. Either they use lots of bandwidth
115 | or wait a long time for new messages. So this design is not suitable for real-time communication.
116 |
117 | For publishers, this design also suffers from uncontrollable expenses. If there are suddenly
118 | many subscribers, or they set their `pollFrequency` very high, this increases costs for the
119 | publisher, which in practice will lead to outages. Thus the most popular content is the most
120 | likely to be unavailable, which is the opposite of what is needed.
121 |
122 | Also, this model uses a network connection per poll, which is likely to be a
123 | limiting factor for publishers with large numbers of subscribers.
124 |
125 | The total number of network connections over some time period
126 | is, for the subscriber:
127 |
128 | `sub.pollFrequency * sub.feeds.length`
129 |
130 | and for the publisher:
131 |
132 | `avg_poll_frequency * subscribers`
133 |
134 | ## append-only poll
135 |
136 | Messages in a feed have a total order defined by an always increasing
137 | value such as a sequence number, such that any message's sequence is strictly greater
138 | than that of any preceding message. If the sequence number of the first message is 1,
139 | then the number of messages in the feed (`feed.length`) is also the sequence number of the last item.
140 |
141 | > (footnote: By sending messages in order, if a transmission fails part-way,
142 | the requester's copy of the feed is still a valid append-only log with no gaps - but their latest
143 | message is just not the true latest message. Next time they connect they will receive the missing messages.)
144 |
145 | Instead of sending all messages per poll, the subscriber requests all messages greater
146 | than the sequence number of the latest message they currently have.
147 | This requires sending only a tiny header (the sequence number),
148 | and the publisher only sends each message to each subscriber once.
149 |
150 | The publisher expects a sequence number, and returns any messages greater than that.
151 | ```
152 | pub.serve(n => pub.feeds[pub.id][n...])
153 | ```
154 |
155 | The subscriber connects to a pub, and appends the messages the pub returns to their copy,
156 |
157 | ```
158 | received = sub.connect(pub.id, sub.feeds[pub.id].length)
159 | sub.feeds[pub.id].append(received)
160 | ```
161 | now the subscriber is consistent with the publisher.
162 |
163 | > (footnote: The publisher sends the messages in order, so if a connection fails part-way
164 | through, the subscriber's copy still has sequential messages with no gaps.)
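
A minimal sketch of one such poll, reusing the structures from the data model sketch
(the network plumbing is elided, and the subscriber is assumed to already hold a,
possibly empty, copy of the feed):

``` js
// publisher side: given the subscriber's current sequence, return newer messages
function serve (pub, seq) {
  // log is 0-indexed, so log.slice(seq) is every message with sequence > seq
  return pub.feeds[pub.id].log.slice(seq)
}

// subscriber side: one poll of one publisher
function poll (sub, pub) {
  var copy = sub.feeds[pub.id]
  var received = serve(pub, copy.log.length) // in practice this crosses the network
  received.forEach(function (msg) { copy.log.push(msg) })
}
```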
165 |
166 | The cost for the subscriber is as follows
167 |
168 | ```
169 | sub.pollFrequency * sub.feeds.length + total_messages
170 | ```
171 |
172 | This is a significant improvement over polled scan because each message is only downloaded once.
173 | However, the subscriber must still send their current sequence number to each publisher, on each poll.
174 | Although we can reasonably assume that the sequence number is significantly smaller
175 | than a message, if the `pollFrequency` or `sub.feeds.length` is high this can become significant.
176 |
177 | The number of connections needed is the same as for polled scan.
178 |
179 | For a suddenly popular publisher, many incoming requests can still lead to availability problems,
180 | as the simple number of requests becomes overwhelming, although because the entire feed of messages
181 | does not need to be sent the practical limit is much higher.
182 |
183 | ## append-only gossip (scuttlebutt)
184 |
185 | In a gossip protocol, instead of subscribers polling publishers,
186 | "peers", which can be both publishers and subscribers, connect to each other randomly.
187 | On each connection, instead of requesting a single feed, peers send a "vector clock".
188 | Instead of representing a global sequence, a vector clock just includes the sequence on each
189 | peer that contributed to the state. A peer's current vector clock is just a map of the latest
190 | sequence of each feed:
191 |
192 | ```
193 | vector_clock = map(peer.feeds, id => peer.feeds[id].length)
194 | ```
195 |
196 | When a peer receives the remote vector clock, they can simply calculate whether there are
197 | any messages they need to send and send them.
198 |
199 | ```
200 | peer.serve(clock => mapValues(clock, (id, sequence) => peer.feeds[id][sequence...]))
201 | ```
202 |
203 | A client just connects to a random peer, sends their clock, and appends messages they receive
204 |
205 | ```
206 | each(
207 | peer.connect(random_peer.id, vector_clock),
208 | msg => peer.feeds[msg.id].append(msg)
209 | )
210 | ```
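
A concrete sketch of one direction of such an exchange, again reusing the data model
sketch from above (a real connection would run this symmetrically, and over a network):

``` js
function vectorClock (peer) {
  var clock = {}
  for (var id in peer.feeds) clock[id] = peer.feeds[id].log.length
  return clock
}

// remote side: given the caller's clock, return every message they are missing
function serveGossip (peer, clock) {
  var out = []
  for (var id in clock)
    if (peer.feeds[id]) out = out.concat(peer.feeds[id].log.slice(clock[id]))
  return out
}

// local side: send our clock, append whatever comes back
function gossipOnce (peer, remote) {
  serveGossip(remote, vectorClock(peer)).forEach(function (msg) {
    peer.feeds[msg.id].log.push(msg)
  })
}
```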
211 |
212 | Since a connection now sends the list of subscriptions,
213 | but only needs to connect to a single peer each poll interval,
214 | more bandwidth is used per connection, but fewer connections are used.
215 | The overall bandwidth used by a peer is the same as with append-only poll,
216 | but the number of connections is now only `O(poll_frequency)`.
217 |
218 | Because messages are no longer passed directly from the publisher to each subscriber,
219 | describing the time needed to disseminate a new message is more complicated.
220 | In the first poll interval, the publisher will be connected to at least 1 other peer.
221 | (The publisher makes 1 outgoing connection, but may receive any number of incoming connections.)
222 | Suppose the message gets passed to only a single peer; then in the second poll interval, there are two peers able
223 | to disseminate the message. If they do not connect again, in the 3rd interval
224 | there will be 4 peers, and so on in powers of 2. However, as the number of peers
225 | with a given message increases, the chance that any two connecting peers already both have the
226 | message increases too, and the rate of dissemination decreases. Thus the overall rate
227 | of dissemination resembles an S curve. Since calculating the actual rate of dissemination
228 | is more complicated, and is affected by practical matters such as the probability that
229 | multiple peers connect to a particular peer at once, instead of calculating
230 | the time, we take measurements from a simple simulation.
231 |
232 | The pattern of dissemination of a single message is the same as flooding gossip.
233 | For a random network with 10,000 peers and each peer creating a connection to one
234 | other peer randomly each interval (so a given peer may receive zero or more incoming connections,
235 | but makes only one outgoing connection), the total number of intervals needed
236 | to disseminate a single message is very small compared to the number of peers.
237 |
238 | ```
239 | round, newly reached (dR), total reached (dT)
240 | 1, 9, 10
241 | 2, 51, 61
242 | 3, 293, 354
243 | 4, 1195, 1549
244 | 5, 3903, 5452
245 | 6, 3875, 9327
246 | 7, 666, 9993
247 | 8, 7, 10000
248 | ```
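
A minimal sketch of such a simulation, under the assumptions stated above
(each peer makes one outgoing connection per round, and a connected pair fully syncs;
this is not the author's actual simulation script):

``` js
// flood one message through N peers and print newly reached (dR) and total (dT)
var N = 10000
var has = new Array(N).fill(false)
has[0] = true // peer 0 is the author

for (var round = 1, total = 1; total < N; round++) {
  var newly = 0
  for (var i = 0; i < N; i++) {
    var j = Math.floor(Math.random() * N) // the peer that i connects to this round
    if (has[i] !== has[j]) {
      // after syncing, both ends of the connection have the message
      if (has[i]) has[j] = true
      else has[i] = true
      newly++
    }
  }
  total += newly
  console.log(round + ', ' + newly + ', ' + total)
}
```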
249 |
250 | In Amazon Dynamo, this protocol design is used to replicate
251 | membership information within a cluster of Dynamo nodes.
252 | The peers run inside a trusted environment, and all peers replicate
253 | all other peers. To add a peer to the network, that peer just
254 | needs to know any other peer. It's not necessary to inform
255 | any master node, and the cluster is highly resilient.
256 |
257 | This design has a significant advantage in availability.
258 | If a peer that originated a message goes offline, but has
259 | disseminated the message to at least one other peer, that message
260 | will continue to flood the network. If a publisher suddenly
261 | becomes very popular, it will not cost them extra resources,
262 | because it's the other peers which will provide the dissemination.
263 |
264 | ## update frequency, overlap, and peer selection
265 |
266 | In Amazon Dynamo, scuttlebutt replication is used as a subcomponent
267 | of the whole system - to keep track of the membership in the
268 | database cluster, and what range of the database each node is
269 | responsible for. When database requests come to a node, that
270 | information is used to route the request to nodes which can handle
271 | it. Each node therefore needs to replicate _all information_ about
272 | membership in the cluster, and also, that information must be kept
273 | continually up to date. Each node emits a regular heartbeat and
274 | this is gossiped across the cluster, and the other nodes use this
275 | information to calculate the probability that a given node is still
276 | active - thus whether requests should be routed to it.
277 |
278 | Other applications using this kind of replication are likely to differ in terms of whether
279 | peers need to replicate the entire dataset, or the regularity with
280 | which they broadcast updates, or both. For example, a chat
281 | application combines messages from everyone in the "room", so
282 | each peer replicates the entire dataset, but each peer only
283 | writes messages to the chat as frequently or infrequently as they
284 | choose to. It's quite likely that a few peers write very frequently
285 | and others read but do not write, or write very little.
286 |
287 | Indeed, in most real world applications, not all updates are
288 | created on a regular basis. There may be a small number of people
289 | you communicate with frequently - the closest family and friends,
290 | but then a broad range of acquaintances that you speak with
291 | occasionally. This pattern, known as a power-law distribution,
292 | is frequently found in both natural and artificial phenomena.
293 | Books and movies are dominated by a small number of best sellers,
294 | but also a large number of cult classics, flops, or break-evens.
295 | Most companies fail in the first few years, but a small number
296 | become so successful that it offsets venture investments
297 | in all the companies that fail. Likewise, it's reasonable to
298 | expect that most applications, if they do not have an explicitly
299 | regular update pattern (such as an environmental sensor), will
300 | probably have activity following a power law in the distribution
301 | of updates. However, if many peers have only infrequent updates,
302 | it's likely that any two peers will exchange vector
303 | clocks with mostly the same values, and this is wasted bandwidth.
304 |
305 | The other question is what portion of the dataset should be replicated
306 | to each node. In Dynamo, or the chat room, the data is replicated to
307 | all nodes, but in most other applications, it's not really desirable
308 | for all peers to have all data. For example, in email, the only peers that really need a particular
309 | message are the sender and the receiver (mail forwarding agents are a necessary evil).
310 |
311 | Email is probably not suited to a replication pattern, as only
312 | the recipient and sender are intended to have a use for a given
313 | message, and email has enough problems with spam that replicating
314 | 3rd party messages seems strange. On the other hand, social media
315 | seems extremely well-suited to a replication design: firstly,
316 | content is already log-oriented. Typically, users explicitly
317 | "follow" or "friend" each other, and the main user interface element
318 | is a combined feed of all followed peers' messages. "Shares"
319 | are broadcast, usually intended to be read by all followers/friends.
320 | Each peer may want to only replicate their friends' data, but
321 | since the main way of meeting new friends is by meeting your friends'
322 | friends, there is a good chance that any friend also holds messages
323 | you wish to replicate.
324 |
325 | If less than the entire dataset is to be replicated to each peer,
326 | this also raises the question of _which peers to connect to?_
327 | In email, this is not an easy question to answer, as anyone knowing
328 | your email address can send you messages. On the other hand,
329 | social media applications present an elegant answer to this question:
330 | The peers you follow are the peers you should connect to. You
331 | are likely to share mutual friends with them, and thus they are
332 | likely to have the feeds you are looking for, and want the feeds
333 | you have.
334 |
335 | A social media application provides good and simple ways
336 | to both choose a partial dataset to replicate and choose who to
337 | replicate it with, and because of the high degree of connectivity
338 | of the social graph, it seems extremely likely that such an
339 | application built on top of an efficient gossip replication protocol
340 | could easily scale to an unlimited number of users. Provided
341 | the implementation can scale to the needs of most individual users,
342 | each user's data overlaps with their friends, and thus the network
343 | could easily cover the entire globe.
344 |
345 | The design we come up with here could be used in any application
346 | that needs to replicate data with a few thousand peers, whether
347 | the dataset is shared fully or has a well defined overlap.
348 | We present a social media application only as a relatively
349 | flexible base architecture.
350 |
351 | ## append-only gossip with request-skipping
352 |
353 | In practice, activity in most datasets follows a power law:
354 | some authors are highly prolific, but most only publish rarely.
355 | Thus, it is likely that when two peers exchange a vector clock in
356 | append-only gossip, the majority of feeds mentioned have not changed.
357 |
358 | > (footnote: Indeed, this became a practical problem in secure-scuttlebutt:
359 | on each connection, each peer sent over half a megabyte of requests,
360 | yet did not actually need to send any messages.)
361 |
362 | The chance that no new messages are sent during a connection increases
363 | with `poll_frequency`.
364 |
365 | _request-skipping_ is an optimization that avoids making feed requests if it seems unlikely
366 | that a feed has changed. It requires storing the received clock from remote peers,
367 | but saves sending many headers after the first connection.
368 |
369 | On the first connection between two peers, the entire clock is sent, but on subsequent connections,
370 | the current clock is compared with the stored copy of the remote clock, and only the feeds that differ are sent.
371 |
372 | ```
373 | // first connection
374 | local_clock = map(peer.feeds, id => peer.feeds[id].length)
375 | // take the stored remote clock, or an empty clock if this is the first connection.
376 | remote_clock = peer.clocks[remote.id] || {}
377 | conn = peer.connect(remote.id)
378 |
379 | conn.send(filter(local_clock, (id, seq) => remote_clock[id] != IGNORE && remote_clock[id] != seq))
380 |
381 | remote_clock2 = conn.recv()
382 | remote_clock = peer.clocks[remote.id] = merge(remote_clock, remote_clock2)
383 |
384 | // if they have requested feeds we did not send, send our current seq for those feeds.
385 | conn.send(map(
386 | filter(remote_clock2, (id, seq) => local_clock[id] != seq),
387 | id => local_clock[id] || IGNORE
388 | ))
389 |
390 | // finally, send any needed messages
391 | conn.send(mapValues(remote_clock, (id, seq) => if local_clock[id] > seq && seq != IGNORE then peer.feeds[id][seq...]))
392 | each(conn.recv(), msg => peer.feeds[msg.author].append(msg))
393 | ```
394 |
395 | `IGNORE` is a special value used to indicate that the remote has requested a feed that we choose not to replicate.
396 | It is necessary to make a definite response in this case, because this enables the remote to remember we are not interested
397 | in this feed, and so they will avoid requesting this feed the next time they connect.
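
For clarity, the clock-filtering step alone, as runnable JavaScript (`IGNORE` is
represented here as -1, which is an assumption; the protocol only needs some
reserved value that cannot be a real sequence number):

``` js
var IGNORE = -1

// decide which clock entries to send, given our current clock and the clock we
// last received from this peer (an empty object on the first connection)
function clockToSend (localClock, storedRemoteClock) {
  var out = {}
  for (var id in localClock) {
    var remote = storedRemoteClock[id]
    // skip feeds the remote told us to ignore, and feeds that have not changed
    // since the clock we stored for them
    if (remote !== IGNORE && remote !== localClock[id]) out[id] = localClock[id]
  }
  return out
}
```

For example, `clockToSend({A: 2, B: 2, C: 3}, {A: 0, B: 1, C: 3})` returns `{A: 2, B: 2}`,
matching the skipping-vector-clock example in README.md.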
398 |
399 | Once we receive the remote's clock and have compared it to the stored copy,
400 | we can calculate everything that needs to be sent or received. In practice,
401 | long-lived connections are used, and we allow new clocks to be sent at any time,
402 | but for simplicity of describing the algorithm we represent it here as having 5 phases:
403 | send initial clock, receive remote clock, send response clock, send messages, receive messages.
404 |
405 | > (footnote: It is essential that we only update our record of the remote clock with data they have explicitly sent
406 | us, and _not_ based on the messages we have sent them. It is possible that a connection fails before
407 | our peer receives a message, but if they send us something we know they meant it.)
408 |
409 | If peers A and B are consistent with respect to feed X, neither will mention X the next time they connect.
410 | However, if either peer receives a new message in X, one of them will mention it and the other will respond,
411 | and the first will send the message. If both receive the new message before they next reconnect, they'll both
412 | mention it, but see they are at the same message and not send it.
413 |
414 | If peer A requests a feed id X that B has not chosen to replicate, B receives `X: <seq>` from A,
415 | and will reply with `X: IGNORE`.
416 | A will store `A.clocks[B.id][X] = IGNORE`, and B will store `B.clocks[A.id][X] = <seq>`.
417 | `IGNORE` is never sent in the initial clock, only in the response. If B later chooses to replicate X,
418 | the next time they connect to A, they'll check their current sequence (which will be 0 at the time they choose
419 | to replicate X),
420 | against their stored clock for A. They'll see that it's different and send `X: 0`
421 | in the initial clock. A will then see that B is no longer ignoring X, and will respond with their
422 | sequence for X. If B doesn't change their mind about X, A will never mention it again.
423 |
424 | > (footnote: In the case that B decides to replicate X, but somehow ends up with the same sequence
425 | that A has for X, then they won't mention it; however, sooner or later, they will receive a new
426 | message in X from someone else, and after this will mention it to A.)
427 |
428 | The worst case, for two given peers exchanging a single feed, is when the poll frequency
429 | is greater or equal to the frequency that new messages are added. This means that each
430 | peer sends a vector clock element for every message added to that feed, so the maximum
431 | number of vector clock elements is the same as the number of messages sent. If the poll
432 | frequency is lower than the message frequency, efficiency increases as each vector clock
433 | element will correspond to potentially many messages. Since this is at worst a constant
434 | factor of the number of messages, it's within acceptable bounds and poll frequency can be
435 | selected for maximum availability without trading off bandwidth usage.
436 |
437 | It is expected that in practice, message frequency differs greatly by feed.
438 | Request skipping saves sending vector clock elements for infrequently updating
439 | feeds, so far fewer vector clock elements need be sent than in append-only gossip,
440 | especially when using high poll frequencies.
441 |
442 | ```
443 | messages + (peers_connected_to * peer.feeds.length) + (peer.pollFrequency / messages)
444 | ```
445 |
446 | There is now only one multiplicative factor in the bandwidth complexity.
447 | We must send the entire vector clock to each peer that we will connect to,
448 | the first time we connect to them. However, luckily, to get provable eventual
449 | consistency, we do not actually need to connect to every peer. As messages
450 | are relayed, we only need the eventual connections to form a connected graph,
451 | _not_ for each pair of peers to eventually connect directly. Consequently, a value for
452 | `peers_connected_to` can be somewhat smaller than the whole swarm.
453 |
454 | Simulating random networks with varying numbers of random connections, the
455 | measured probability that the graph is fully connected rapidly approaches 1
456 | as the average number of connected peers passes 2. As the number of edges
457 | continues to rise, the distance across the graph (and thus dissemination time)
458 | drops.
459 |
460 | ```
461 | edges, P(connected), average, stdev
462 | 1, 0.05, 57.26, 19.385365614297818
463 | 1.1, 0.46, 23.33, 2.549725475418886
464 | 1.2, 0.69, 18.1, 1.6763054614240047
465 | 1.3, 0.7, 15.08, 1.188949115816149
466 | 1.4, 0.8, 13.52, 1.2765578717786399
467 | 1.5, 0.91, 12.33, 0.8130805618141443
468 | 1.6, 0.9, 11.45, 0.82915619758885
469 | 1.7, 0.96, 10.59, 0.8011866199581761
470 | 1.8, 0.97, 9.83, 0.6333245613427602
471 | 1.9, 0.99, 9.29, 0.4958830507287036
472 | 2, 1, 8.72, 0.5306599664568481
473 | 3, 1, 6.91, 0.2861817604250792
474 | 5, 1, 5.39, 0.48774993593029137
475 | 10, 1, 4.59, 0.4918333050943186
476 | 20, 1, 4, 0
477 | ```
478 |
479 | I would suggest using a fixed number of connections per peer in the range 5-10;
480 | this would effectively guarantee a fully connected network and fast dissemination,
481 | without scaling the number of full vector clocks to be sent by very much.
482 |
483 | Also note, this design requires storage of vector clocks, so reducing the number
484 | of peers connected to also keeps that within acceptable bounds.
485 |
486 | ## overlapping replication sets
487 |
488 | So far, we have analyzed the problem space as if all peers under consideration
489 | are replicating the same set of publishers. In some application designs it
490 | may make sense for all peers to replicate the same set of feeds, for example,
491 | in a task tracking system within a medium sized company or other organization.
492 | On the other hand, the really interesting use-cases are ones that scale to millions
493 | of users, and so it might not be feasible to replicate all their data on the average device,
494 | even if you did want to. In secure-scuttlebutt, the target application is a social network.
495 | This provides an interesting middle ground, with both a fair amount of overlap and a
496 | reasonable expectation of it, since one of the primary ways that people meet new friends
497 | is by meeting friends of friends. These encounters might be more or less formal,
498 | but nevertheless, the chance that any two friends have a number of mutual friends in
499 | common is fairly high.
500 |
501 | In the most conservative design, it might be desired to replicate only the direct
502 | friends "followed" by the user. If the follow graph is known, a set of replication
503 | peers can be carefully selected to ensure coverage of all follows. For each feed
504 | a remote peer follows that the local peer does not, a feed id and `IGNORE` will be sent,
505 | but after that, subsequent requests for that feed will be skipped.
506 |
507 | In the current secure-scuttlebutt design, by default peers replicate their friends,
508 | and the friends of their friends. Sampling the actual ssb data, choosing 5 random
509 | peers to replicate, and replicating feeds two hops out on the follow graph (friends,
510 | and friends of friends): in all samples, all the direct friends of the user were
511 | within 2 hop range of the 5 random peers, and on average ~75% (TODO: GRAPH THESE)
512 | of friends of friends were replicated by at least one peer. In ssb, since this could
513 | be further optimized by selecting peers to maximize coverage, and
514 | since request-skipping means we'll only send headers for unreplicated feeds one time,
515 | we can just connect to more random peers and still get acceptable efficiency.
516 |
517 | ## real-time broadcast
518 |
519 | It is obviously desirable that a communication network would
520 | carry messages quickly. For human to human text communication,
521 | latency within a few seconds is usually sufficient. However,
522 | most of the above replication strategies would be unviable
523 | with a `poll_frequency` of a few seconds; not to mention, establishing
524 | a TCP connection has overhead, and several extra messages must be
525 | passed to make that an encrypted TCP connection. So, instead of
526 | simple polling, we should have connections with a longer lifespan -
527 | when a new connection is formed we exchange clocks and receive any
528 | old messages we are missing, via the above polling algorithms,
529 | but then we "stay on the line", and if our peer receives any
530 | additional messages they send those too.
531 | Thus, our model becomes _sync then broadcast_.
532 |
533 | In the non-gossip models, we must eventually connect to every
534 | peer we subscribe to. It would be unviable to hold long-lived
535 | connections to every peer, as they may number in the thousands,
536 | and the overhead of each connection would be too much for
537 | most user devices. But with gossip, we can connect to just a small
538 | number of peers at a time and still receive messages from many peers.
539 |
540 | ## random connected network
541 |
542 | N peers are randomly connected with average K outgoing connections per peer.
543 | (outgoing, because each peer randomly chooses to connect to K other
544 | peers). As discussed in the previous section, the chance that the network
545 | is fully connected rapidly approaches 1 as K approaches 2, and
546 | then the average shortest path between nodes shortens as redundant connections increase.
547 | For the network to broadcast a message, the originating peer sends it to all
548 | neighbouring peers, and when a peer receives a _new_ message,
549 | they send it to all their connected peers except the peer they received
550 | the message from. Consider a network with 3 peers and 2 connections each.
551 | A creates a new message and transmits it to B and C, and B and C then
552 | transmit the message to each other. Thus the message is sent twice
553 | by A and once each by B and C. The total bandwidth used by the
554 | network is 4. Since A creates the message and there are only
555 | two other peers, only the transmissions to B and C are necessary,
556 | but B and C don't know that the other already has the message.
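
The flooding rule itself is tiny. A sketch, assuming `g` is a bidirectional adjacency
map (`g[from][to] = true`) like the ones built in `hops.js` and `demos.js`:

``` js
// flood one message from `origin`; returns the total number of transmissions,
// counting redundant ones
function flood (g, origin) {
  var seen = {}
  seen[origin] = true
  var sent = 0
  var queue = [{ to: String(origin), from: null }]
  while (queue.length) {
    var pkt = queue.shift()
    for (var peer in g[pkt.to]) {
      if (peer === pkt.from) continue // never send back to whoever sent it to us
      sent++
      if (!seen[peer]) {              // forward a message only the first time we see it
        seen[peer] = true
        queue.push({ to: peer, from: pkt.to })
      }
    }
  }
  return sent
}
```

For a connected graph this counts `directed_edges - (peers - 1)` transmissions,
which should correspond to the `msgs` column in the table below.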
557 |
558 | Simulating a broadcast in a random network with up to 20 connections
559 | per peer, and measuring hops, average hops, messages transferred:
560 |
561 | |K|peers|hops|avg|msgs|inefficiency|
562 | |-|-----|----|---|----|------------|
563 | |1|1000|14|6.657|999|1|
564 | |2|1000|7|3.657|2981|2.984|
565 | |3|1000|6|2.944|4947|4.952|
566 | |4|1000|5|2.842|6913|6.92|
567 | |5|1000|5|2.605|8861|8.87|
568 | |6|1000|5|2.515|10803|10.814|
569 | |7|1000|4|2.388|12731|12.744|
570 | |8|1000|4|2.361|14671|14.686|
571 | |9|1000|4|2.306|16605|16.622|
572 | |10|1000|4|2.193|18487|18.506|
573 | |11|1000|4|2.201|20357|20.377|
574 | |12|1000|4|2.136|22237|22.259|
575 | |13|1000|4|2.118|24163|24.187|
576 | |14|1000|4|2.118|25993|26.019|
577 | |15|1000|4|2.027|27877|27.905|
578 | |16|1000|4|2.008|29709|29.739|
579 | |17|1000|4|2.046|31567|31.599|
580 | |18|1000|4|1.994|33393|33.426|
581 | |19|1000|4|1.94|35281|35.316|
582 | |20|1000|4|1.933|37135|37.172|
583 |
584 | > (footnote: With 1000 peers and one connection we only need to send
585 | 999 messages because the first peer is the author of the message
586 | and did not need to send it.)
587 |
588 | Note, with more than one connection, the number of hops (which is
589 | the time taken for the last message to arrive) decreases slowly,
590 | but the average case, the time for 50% of the network to receive the message,
591 | decreases much more quickly, and the (bandwidth)
592 | inefficiency increases fastest of all.
593 | With K=2, nearly 3 times as many messages as necessary are sent,
594 | and with K=5, nearly 9 times too many messages are sent!
595 |
596 | So with a simple flooding design, we pay a lot in bandwidth for reducing latency.
597 |
598 | If we were to prune the redundant connections, we could get low latency
599 | without bandwidth overhead. However, since a pure spanning
600 | tree has no redundency it's also very fragile. If one connection close
601 | to the root of the tree (the originator of a message) fails, all downstream
602 | peers will be cut off.
603 |
604 | ## spanning trees
605 |
606 | Epidemic broadcast trees (EBT) is an algorithm to form a spanning tree from
607 | a random network, but instead of completely removing redundant connections,
608 | they are just moved into a _lazy_ or _pull_ state. When in the lazy state,
609 | only headers (equivalent to vector clock elements) are sent. Which connections
610 | are redundant can be detected by each peer observing the order in which they
611 | first receive a message, and thereafter observing latency. For example, in the
612 | 3 node network discussed in the previous section, A transmits a message to B and C,
613 | neither of them have received this message before, so they know that their connection
614 | to A is not redundant. Then, they each receive a second copy of the message from B,C
615 | so they both know that for messages from A, the connection between B-C is redundant.
616 | So, B and C exchange short messages each requesting the other to disable that connection (for messages from A). When A broadcasts another message, B and C receive
617 | it directly from A again, but since the redundant connections are disabled, they do not
618 | transmit it again. Instead, they only send a short message, equivalent to a vector clock
619 | element, to indicate they know this message exists. If, later, the connection between
620 | A and C breaks and A broadcasts another message, it will only be received by B.
621 | B then sends the short lazy check to C, who then realizes that this is the first they
622 | have heard about this message - therefore, B must now be closer to the source than they are.
623 | C then sends a message to re-request active transmission of messages from A, and B sends
624 | the message to C. (note, re-establishing an active connection takes just one round-trip)
625 |
626 | 
627 |
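To make the eager/lazy switching concrete, here is a minimal sketch of the rule
described above. This is not the actual ssb-ebt code; the names (`createEbtPeer`,
`onMessage`, `onNote`, and the transport stubs) are hypothetical, and a real
implementation keeps this state per feed and uses observed latency to pick the
best eager link rather than simply reacting to the first duplicate.

``` js
// transport stubs - in a real peer these would write to network connections
function send (peerId, msg) { /* full message */ }
function sendNote (peerId, msgId) { /* short "I have this" note */ }
function sendPrune (peerId, sourceId) { /* "go lazy for this source" */ }
function sendGraft (peerId, sourceId) { /* "go eager again for this source" */ }

function createEbtPeer (sourceId) {
  var peers = {} // peerId -> { mode: 'eager' | 'lazy' }
  var seen = {}  // messageId -> peerId that delivered it first

  return {
    connect: function (peerId) {
      peers[peerId] = { mode: 'eager' } // start by actively forwarding
    },
    // a full message arrived from `from`
    onMessage: function (from, msg) {
      if (seen[msg.id] === undefined) {
        seen[msg.id] = from
        for (var id in peers) {
          if (id === from) continue
          // forward the message on eager links, only a note on lazy ones
          if (peers[id].mode === 'eager') send(id, msg)
          else sendNote(id, msg.id)
        }
      } else {
        // duplicate: this link is redundant for this source, so demote it
        peers[from].mode = 'lazy'
        sendPrune(from, sourceId)
      }
    },
    // only a note arrived from `from`
    onNote: function (from, msgId) {
      if (seen[msgId] === undefined) {
        // we heard of this message first via a lazy link, so `from` is now
        // closer to the source: promote it and re-request full messages
        peers[from].mode = 'eager'
        sendGraft(from, sourceId)
      }
    }
  }
}
```
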
628 | EBT still sends redundant data, but the notes sent along the redundant connections
629 | are significantly smaller than the messages. Also, if a small delay is introduced,
630 | it is not necessary to send a note for every message, only for the latest one:
631 | if several messages are received in quick succession, a single note covers them all.
632 | Further, if a random delay somewhat greater than the round trip time is added,
633 | then about 50% of the time the same note arrives from the other peer before it is sent, and it can be skipped.
634 |
635 | For example, B and C receive the message from A at approximately the same time.
636 | If B decides to wait one second, C waits two seconds, and the note from B to C arrives after
637 | 0.1 seconds, then C knows that B already knows about that message and does not need to send a note back.
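
A small sketch of that note batching, with the caveat that the names here
(`createNoteScheduler`, `queue`, `cancel`) are illustrative rather than the real
implementation:

``` js
// Batch notes: keep only the latest sequence per feed and flush it after a
// randomized delay somewhat greater than a round trip time.
function createNoteScheduler (sendNote, baseDelay) {
  var pending = {} // feedId -> { seq, timer }

  return {
    // called instead of sending a note for (feedId, seq) immediately
    queue: function (feedId, seq) {
      var p = pending[feedId]
      if (p) {
        p.seq = Math.max(p.seq, seq) // several in quick succession: keep the latest
        return
      }
      var delay = baseDelay + Math.random() * baseDelay
      pending[feedId] = {
        seq: seq,
        timer: setTimeout(function () {
          var latest = pending[feedId]
          delete pending[feedId]
          sendNote(feedId, latest.seq)
        }, delay)
      }
    },
    // called when the other peer's note for (feedId, seq) arrives first:
    // if it covers everything we were going to announce, skip our note
    cancel: function (feedId, seq) {
      var p = pending[feedId]
      if (p && p.seq <= seq) {
        clearTimeout(p.timer)
        delete pending[feedId]
      }
    }
  }
}
```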
638 |
639 | # singleton hub
640 |
641 | > (footnote: To make the strongest argument for the performance of EBT + request-skipping,
642 | > compare it to a fully centralized model.)
643 |
644 | To this point, most social networks have been implemented
645 | as a star shaped network: essentially one peer that distributes
646 | all messages to all other peers. If this were designed around a replication
647 | protocol, a client would use something like the append-only poll,
648 | except that the server would remember each client's vector clock at each timestamp,
649 | along with all their subscriptions, so the client would only need to send the time it last synced.
650 | The server would then send all new messages on any of the client's subscriptions.
651 |
652 | On each connection, the client needs to send its last connection time,
653 | and the server still has to send each message. If a client polls at a low rate,
654 | the client sends one header and receives many messages. If the client
655 | polls at a high rate, it may make one request per message. (Long-lived
656 | connections would also help here.)
657 |
658 | The client would keep track of the sequence number representing
659 | its own read feed; on each connection it would request any messages
660 | that have occurred since the last connection, but the central server
661 | still has to send the messages.
662 |
663 | `O(poll_frequency + messages)`
664 |
665 | That is the cost to each client. The central server, of course, must pay for a lot of resources:
666 |
667 | bandwidth:
668 |
669 | `O(network_clients * poll_frequency + network_clients * messages)`
670 | and connections:
671 |
672 | `O(network_clients * poll_frequency)`
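
A minimal sketch of this idealized hub, just to make the accounting concrete (the
names `createHub`, `publish`, `subscribe`, and `sync` are hypothetical, not any
real server's API):

``` js
// Idealized central hub: stores every message with an arrival timestamp and
// remembers each client's subscriptions, so a poll is just one timestamp.
function createHub () {
  var log = []           // { feed, seq, ts, content } in arrival order
  var subscriptions = {} // clientId -> { feedId: true }

  return {
    publish: function (msg) {
      msg.ts = Date.now()
      log.push(msg)
    },
    subscribe: function (clientId, feedId) {
      subscriptions[clientId] = subscriptions[clientId] || {}
      subscriptions[clientId][feedId] = true
    },
    // the whole request is `lastSync`, but the server still has to scan and
    // send every new message on the client's subscriptions
    sync: function (clientId, lastSync) {
      var subs = subscriptions[clientId] || {}
      return log.filter(function (msg) {
        return msg.ts > lastSync && subs[msg.feed]
      })
    }
  }
}
```

Each `sync` costs the client a single timestamp, but the server still transmits every
new message to every interested client, which is where the `network_clients * messages`
term above comes from.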
673 |
674 | If a network is successful, `network_clients` can easily become very
675 | large: millions or billions of clients.
676 |
677 | ## conclusion
678 |
679 | An idealized centralized network is presented here as the best possible case for efficiency,
680 | yet it beats our design only by a constant factor. Between EBT with a fixed number
681 | of connected peers and request-skipping, we can manage the bandwidth overhead, and the main difference
682 | is only in vector clock elements, which are very small compared to messages.
683 |
684 | In the current secure-scuttlebutt implementation, which uses base64 encoded strings to
685 | encode 256 bit public keys plus a base 10 integer sequence number, vector clock elements are about 60 bytes,
686 | and the average message is 660 bytes (although the maximum message size is 8kb), so the average message is 11 times
687 | bigger than a single vector clock element.
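
As a rough, illustrative sanity check on that 60 byte figure (an estimate from the
id format, not a measurement):

``` js
// a feed id is '@' + base64(32 byte key) + '.ed25519'; a vector clock element
// is that id plus a sequence number and a little JSON punctuation
var keyChars = Math.ceil(32 / 3) * 4            // 44 base64 characters
var feedId   = 1 + keyChars + '.ed25519'.length // '@' + key + suffix = 53 characters
var element  = 2 + feedId + 1 + 5 + 1           // quotes, colon, ~5 digit sequence, comma
console.log(element)                            // ~62 bytes, close to "about 60 bytes"
```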
688 |
689 | I would expect that, for a typical peer, most messages would be replicated after it has been offline for a while,
690 | so one vector clock element brings in many messages. For messages replicated in real time,
691 | the extra bandwidth used is managed by limiting the number of connections.
692 |
693 | The performance of our design is close enough to the optimal centralized system to realistically
694 | argue that it is viable at massive scale. In practice, we believe that small difference will easily
695 | be made up for by the other advantages of adopting a decentralized system.
696 | For example, the significant costs associated with running such a system are now spread evenly
697 | across the network participants. With a fully decentralized gossip protocol, peers can join in any
698 | topology. If two peers are offline but near each other, it is possible for them to share data
699 | directly over bluetooth or wifi, or by directly exchanging physical media. This means secure-scuttlebutt
700 | is potentially able to serve remote areas of the earth that have not yet received modern infrastructure,
701 | as well as areas where that infrastructure is disrupted by warfare or other disasters.
702 |
703 |
704 |
705 |
706 |
--------------------------------------------------------------------------------
/random.js:
--------------------------------------------------------------------------------
1 | var el = require('./demos').random()
2 | document.body.appendChild(el)
3 |
4 | el.dispatchEvent(new FocusEvent('focus'))
5 |
--------------------------------------------------------------------------------
/references.md:
--------------------------------------------------------------------------------
1 |
2 | Efficient Reconciliation and Flow Control for Anti-Entropy Protocols
3 | https://www.cs.cornell.edu/home/rvr/papers/flowgossip.pdf
4 |
5 | Dynamo: Amazon's Highly Available Key-value Store
6 | https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf
7 |
8 | Timestamps in Message-Passing Systems That Preserve Partial Ordering
9 | http://zoo.cs.yale.edu/classes/cs426/2012/lab/bib/fidge88timestamps.pdf
10 |
11 | RSS specification
12 | https://cyber.harvard.edu/rss/rss.html
13 |
14 | Atom Specification
15 | https://tools.ietf.org/html/rfc4287
16 |
17 | Epidemic Broadcast Trees
18 | http://www.gsd.inesc-id.pt/~ler/reports/srds07.pdf
19 |
20 |
21 |
--------------------------------------------------------------------------------
/repair-lazy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/repair-lazy.jpg
--------------------------------------------------------------------------------
/simulation/connected.js:
--------------------------------------------------------------------------------
1 | var stats = require('statistics')
2 |
3 | var N = 1000, M = 5 // N peers, ~M random connections per peer
4 |
5 | function isEmpty (o) {
6 | for(var k in o) return false
7 | return true
8 | }
9 |
10 | function test (N, M) { // build a random graph with ~M edges per node, then flood from node 0
11 | var g = {}
12 |
13 | for(var i = 0; i < N; i++) {
14 | g[i] = g[i] || {}
15 | for(var k = 0; k + Math.random() < M; k++) {
16 | var j = ~~(Math.random()*N)
17 | g[j] = g[j] || {}
18 | g[i][j] = g[j][i] = true
19 | }
20 | }
21 |
22 | var reachable = {}
23 | var next = {}, hops = 0, connected = 1
24 |
25 | reachable[0] = next[0] = true
26 |
27 |   while(!isEmpty(next)) { // flood outward from node 0, counting hops
28 | hops ++
29 | for(var k in next) {
30 | for(var j in g[k])
31 | if(!reachable[j]) {
32 | connected ++
33 | reachable[j] = true
34 | next[j] = true
35 | }
36 | delete next[k]
37 | }
38 | }
39 | return { connected: connected === N, reachable: connected, hops: hops }
40 | }
41 |
42 | console.log('M (edges per peer), P(connected), mean hops, stdev hops')
43 |
44 | ;[1.0,1.1,1.2,1.3,1.4,1.5,1.6, 1.7,1.8,1.9,2,3,5,10, 20].forEach(function (m) {
45 | var prob = 0, dist = stats.initial()
46 | var c = 0
47 | for(var i = 0; i < 100;i++) {
48 | var data = test(N, m)
49 | dist = stats(dist, data.hops)
50 | if(data.connected)
51 | c++
52 | }
53 | console.log([m, c/100, dist.mean, dist.stdev].join(', '))
54 | })
55 |
56 |
--------------------------------------------------------------------------------
/simulation/flood.js:
--------------------------------------------------------------------------------
1 |
2 |
3 | console.log('round, newly reached, total reached')
4 | var n = {}, C = 1, N = 10000 // n: has-message flags, C: peers reached so far, N: population
5 |
6 | for (var i = 0; i < N; i++)
7 | n [i] = false
8 |
9 | n[0] = true
10 | var k = 1
11 | while(C < N) {
12 | var m = 0, nn = {}
13 |   for(var i = 0; i < N; i++) { // each peer gossips with one random peer; if either has the message, both do
14 | var j = ~~(Math.random()*N)
15 | if(n[i] != n[j]) {
16 | m++
17 | n[i] = n[j] = true
18 | //nn[i] = nn[j] = true
19 | C++
20 | }
21 | }
22 | // for(var K in nn)
23 | // n[K] = nn[K]
24 | console.log([k++, m, C].join(', '))
25 | }
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/simulation/friends.js:
--------------------------------------------------------------------------------
1 |
2 | var B = 0.8,
3 |
--------------------------------------------------------------------------------
/simulation/spanning.js:
--------------------------------------------------------------------------------
1 | function toLetter (i) {
2 | return String.fromCodePoint('A'.codePointAt(0) + i)
3 | }
4 |
5 | var N = 20, k = 3 // N nodes, ~k extra random edges per node
6 | var g = {}
7 | var hops = {}
8 | hops.A = 0
9 | g.A = {}
10 | for(var i = 1; i < N; i++) { // attach each node to a random earlier node (random tree)
11 | g[toLetter(i)] = g[toLetter(i)] || {}
12 | var j = ~~(Math.random()*i)
13 | console.log('->', toLetter(j), toLetter(i), g)
14 | g[toLetter(j)][toLetter(i)] = 1
15 | }
16 |
17 | for(var i = 0; i < N; i++) { // then add ~k extra random edges per node
18 | for(var ii = 1; ii +Math.random() < k; ii++) {
19 | var j = ~~(Math.random()*N)
20 | g[toLetter(i)][toLetter(j)] = 1
21 | }
22 | }
23 |
24 | function isEmpty (e) {
25 | for(var k in e) return false
26 | return true
27 | }
28 |
29 | function spanning (g) { // BFS from A: returns hop counts and the spanning tree edges
30 | var next = {}
31 | var reachable = {}
32 | var s = {}
33 | next['A'] = true
34 | reachable['A'] = 0
35 | while(!isEmpty(next)) {
36 | for(var k in next) {
37 | for(var j in g[k]) {
38 | if(reachable[j] == undefined) {
39 | s[k] = s[k] || {}
40 | s[k][j] = reachable[k] + 1
41 | reachable[j] = reachable[k] + 1
42 | next[j] = true
43 | }
44 | }
45 | delete next[k]
46 | }
47 | }
48 |
49 | var hops = {}
50 | // console.log('S', s, '...', g)
51 | return {hops: reachable, spanning: s}
52 | }
53 |
54 |
55 | console.log(g)
56 |
57 | function remap (g) { // relabel nodes in BFS order and recompute the spanning tree
58 | var s = spanning(g)
59 | var remap = {}, i = 0
60 | for(var k in s.hops)
61 | remap[k] = toLetter(i++)
62 | console.log(s.spanning)
63 | console.log(s.hops)
64 | console.log(remap)
65 | var _g = {}
66 | for(var j in g)
67 | for(var k in g[j]) {
68 | _g[remap[j]] = _g[remap[j]] || {}
69 | _g[remap[j]][remap[k]] = g[j][k]
70 | }
71 | console.log("G", _g)
72 | return spanning(_g)
73 | }
74 |
75 | console.log(remap(g).spanning)
76 |
77 |
--------------------------------------------------------------------------------