├── .travis.yml ├── LICENSE ├── README.md ├── centralized.js ├── demos.js ├── grid.js ├── hops.js ├── images ├── redundant.svg ├── spanning-with-redundant.svg ├── spanning-without-redundant.svg └── test.svg ├── index.html ├── measurements └── overlap.js ├── message-passing.jpg ├── message-passing2.jpg ├── package.json ├── paper.md ├── random.js ├── references.md ├── repair-lazy.jpg └── simulation ├── connected.js ├── flood.js ├── friends.js └── spanning.js /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.6 4 | - 0.8 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 'Dominic Tarr' 2 | 3 | Permission is hereby granted, free of charge, 4 | to any person obtaining a copy of this software and 5 | associated documentation files (the "Software"), to 6 | deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom 10 | the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 18 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR 20 | ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scalable Secure Scuttlebutt 2 | 3 | * @dominictarr 4 | 5 | * scuttlebutt.nz 6 | 7 | --- 8 | 9 | # Scuttlebutt 10 | 11 | "anti-entropy" gossip protocol, part of amazon dynamo 12 | 13 | * "flow control for anti-entropy protocols" 14 | * eventually consistent 15 | * used by within a trusted system 16 | 17 | --- 18 | 19 | # Secure Scuttlebutt 20 | 21 | * peer id is public key 22 | * data model signed hash chains 23 | * peers can relay other's messages 24 | * cannot insert, modify, or reorder messages 25 | * large objects in optional "attachments" 26 | 27 | --- 28 | 29 | * subscribe ("follow") model solves sybil attacks 30 | * provides social discovery 31 | * messages arrive in natural order - easy to build database on top of 32 | 33 | --- 34 | 35 | # Scalable Secure Scuttlebutt 36 | 37 | * optimized for bandwidth and latency 38 | * overhead proportional to feeds updated 39 | 40 | --- 41 | 42 | # part 1: network topology 43 | 44 | --- 45 | 46 | # Star Network 47 | 48 | naive design: connect to everyone 49 | 50 | --- 51 | 52 | ``` js_exe 53 | return require('./demos').centralized() 54 | ``` 55 | 56 | --- 57 | 58 | # Random Network 59 | 60 | * connect to N > 2 peers ~= fully connected 61 | * tradeoff between bandwidth & latency 62 | 63 | --- 64 | 65 | ``` js_exe 66 | return require('./demos').random() 67 | ``` 68 | 69 | --- 70 | 71 | # Rules for gossip flooding 72 | 73 | send _new_ messages to everyone that didn't send it to you 74 | 75 | (don't resend messages you already know!) 76 | 77 | --- 78 | 79 | # message passing diagram 80 | 81 | ![message passing diagram](./message-passing2.jpg) 82 | 83 | --- 84 | 85 | # bandwidth complexity 86 | 87 | ``` 88 | O(messages*(spanning+2*redundant)) 89 | ``` 90 | --- 91 | 92 | # how many redundant connections 93 | 94 | * 1: 0% 95 | * 2: 66% 96 | * 3: 80% 97 | * 4: 86% 98 | * 5: 89% 99 | * 10: 95% 100 | 101 | at 5 connections per peer, we use 9 times as much bandwidth 102 | --- 103 | 104 | # Spanning Tree Network 105 | 106 | remove duplicate connections 107 | 108 | * when you receive an old message, disable that connection 109 | * bandwidth complexity now approaches O(messages) 110 | 111 | --- 112 | 113 | ``` js_exe 114 | return require('./demos').spanning() 115 | ``` 116 | 117 | --- 118 | 119 | # trees are fragile 120 | 121 | ``` js_exe 122 | return require('./demos').fragile() 123 | ``` 124 | 125 | 126 | --- 127 | 128 | # spanning tree 129 | 130 | * network partitions are really easy 131 | * we just added many points of failure! 
132 | 133 | --- 134 | 135 | # Epidemic Broadcast Trees 136 | 137 | * (from paper with the same name) 138 | 139 | * best of flooding and spanning 140 | * we run a separate tree per feed 141 | --- 142 | 143 | # Eager and Lazy (aka push and pull) 144 | 145 | * connections have two modes: **eager** and **lazy** 146 | * switch redundant connections into **lazy** 147 | * **eager** mode pushes message immediately 148 | * **lazy** mode sends short _note_ that message exists 149 | 150 | --- 151 | 152 | * if you receive a known message, ask them to be **lazy** 153 | * (if in **lazy** mode, just send a short note) 154 | * if you receive a note for a new message, 155 | switch connection back to **eager** 156 | 157 | --- 158 | 159 | ![switching back to eager](./repair-lazy.jpg) 160 | 161 | --- 162 | 163 | # state per feed 164 | 165 | * feed id (string) 166 | * local sequence (integer) 167 | * remote sequence (integer) 168 | * local request (integer) 169 | * remote request (integer) 170 | * local mode (boolean) 171 | * remote mode (boolean) 172 | 173 | --- 174 | 175 | # part 2: handshakes 176 | 177 | --- 178 | 179 | # basic vector clock 180 | 181 | (as described in flow gossip paper) 182 | 183 | ``` 184 | Alice: {A: 1, B: 0, C: 3} 185 | 186 | Bob: {A: 0, B: 1, C: 3} 187 | ``` 188 | alice sends A1, Bob sends B1 189 | handshake size is O(feeds) 190 | 191 | --- 192 | 193 | # skipping vector clock 194 | 195 | Alice stores Bob's last vector clock 196 | 197 | ``` 198 | Alice (local): {A: 2, B: 2, C: 3} 199 | 200 | Bob (stored): {A: 0, B: 1, C: 3} 201 | ``` 202 | drops elements which havn't changed 203 | (Carol hasn't updated) 204 | 205 | --- 206 | 207 | > Note: Alice stores Bob's A:0 208 | even though she sent A1. 209 | unless Bob acknowledges A1, 210 | Alice doesn't know for sure he has it. 211 | 212 | --- 213 | 214 | # so Alice sends: 215 | 216 | ``` 217 | Alice: {A: 2, B: 2} 218 | 219 | Bob: {A: 1, B: 3} 220 | ``` 221 | Alice sends A2, Bob sends B3 222 | 223 | --- 224 | # if Bob then gossips with Carol 225 | 226 | ``` 227 | Alice: {} 228 | 229 | Bob: {C: 4} 230 | ``` 231 | 232 | Alice sends an empty clock, 233 | because she thinks nothing has 234 | changed 235 | --- 236 | # but Bob has an update from Carol 237 | 238 | ``` js 239 | Alice: {C: 3} 240 | ``` 241 | and Bob sends C4 242 | 243 | so, it costs one more message pass, 244 | but usually saves lots of requests. 245 | 246 | > note, we also save tracking this in memory 247 | 248 | --- 249 | 250 | # distribution of updates 251 | 252 | * power law is typical 253 | * a few active users 254 | * some moderate users 255 | * many occasional / infrequent users 256 | 257 | --- 258 | 259 | * basic scuttlebutt 260 | 261 | O(messages + feeds) 262 | 263 | * scalable scuttlebutt 264 | 265 | O(messages + updated_feeds) 266 | 267 | (updated_users is probably much smaller) 268 | 269 | reconnections are very cheap! 
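---

# request-skipping, in code

The skipping handshake above boils down to a diff between our current clock and the clock we last stored for this peer. A rough sketch in plain JS (names are illustrative only, not the real ssb replication API):

``` js
// send only the feeds whose sequence differs from what we stored for this peer
function entriesToSend (localClock, storedRemoteClock) {
  var out = {}
  for (var id in localClock)
    if (storedRemoteClock[id] !== localClock[id])
      out[id] = localClock[id]
  return out
}

// Alice, from the earlier example:
// entriesToSend({A: 2, B: 2, C: 3}, {A: 0, B: 1, C: 3}) => {A: 2, B: 2}
```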
270 | 271 | --- 272 | 273 | 274 | # complexity overview 275 | 276 | bandwidth: O(messages + updated_feeds) 277 | 278 | latency: replication in 1 or 2 round trips 279 | 280 | --- 281 | 282 | # Part 3: Comparison to IPFS 283 | 284 | --- 285 | 286 | * replicate data without global consistency 287 | * secure scuttlebutt optimizes for chains 288 | * this is worst case for ipfs 289 | 290 | --- 291 | 292 | * ssb streams updates in order, per feed 293 | * ipfs has pointer to latest message 294 | then works backwards, one message at a time 295 | 296 | --- 297 | 298 | * ipfs better for arbitary partial datasets 299 | * ssb better for social applications 300 | * ssb: database vs ipfs: file system 301 | 302 | --- 303 | 304 | # Thank you! 305 | 306 | * learn more: https://scuttlebutt.nz 307 | * frontend: https://github.com/ssbc/patchwork 308 | * backend: https://github.com/ssbc/scuttlebot 309 | 310 | --- 311 | 312 | 313 | 314 | 315 | 316 | -------------------------------------------------------------------------------- /centralized.js: -------------------------------------------------------------------------------- 1 | var el = require('./demos').centralized() 2 | document.body.appendChild(el) 3 | 4 | el.dispatchEvent(new FocusEvent('focus')) 5 | -------------------------------------------------------------------------------- /demos.js: -------------------------------------------------------------------------------- 1 | var h = require('hyperscript') 2 | var random = require('network-animator/random') 3 | var animate = require('network-animator/animate') 4 | var G = require('graphreduce') 5 | var h = require('hyperscript') 6 | var random = require('network-animator/random') 7 | 8 | var W = 600, H = 600 9 | 10 | function delay (fn) { 11 | setTimeout(fn, 1000) 12 | } 13 | 14 | function canvas () { 15 | return h('canvas', {width: W, height: H}) 16 | } 17 | 18 | function starter (fn) { 19 | return function () { 20 | return fn(()=>false) 21 | 22 | } 23 | var stopped = false 24 | function stop () { 25 | return stopped 26 | } 27 | return function () { 28 | var d = h('div') 29 | d.addEventListener('focus', function () { 30 | stopped = false 31 | d.innerHTML = '' 32 | d.appendChild(fn(stop)) 33 | }) 34 | d.addEventListener('blur', function () { 35 | stopped = true 36 | }) 37 | return d 38 | } 39 | } 40 | 41 | function circular (n, center, radius) { 42 | var a = {}, theta = Math.PI*2/n 43 | for(var i = 0; i < n; i++) 44 | a[i] = {x: center.x + Math.cos(i*theta)*radius, y: center.y + Math.sin(i*theta)*radius} 45 | return a 46 | } 47 | 48 | function central (N, center, radius) { 49 | var loc = circular(N-1, center, radius) 50 | 51 | for(var i = N - 1; i > 0; i--) 52 | loc[i] = loc[i-1] 53 | 54 | loc[0] = center 55 | return loc 56 | } 57 | 58 | exports.centralized = starter(function (stop) { 59 | var c = canvas() // 60 | var g = {} 61 | var N = 200 62 | for(var i = 0; i < N; i++) { 63 | G.addEdge(g, 0, i) 64 | G.addEdge(g, i, 0) 65 | } 66 | 67 | // var loc = random.locations(N, c.width, c.height) 68 | var loc = central(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9) 69 | var label = h('label') 70 | 71 | animate(g, loc, c.getContext('2d'), function next (_, packets) { 72 | var e = 0 73 | for(var i in packets) 74 | if(packets[i].extra) { 75 | e ++ 76 | delete g[packets[i].from][packets[i].to] 77 | delete g[packets[i].to][packets[i].from] 78 | } 79 | label.textContent = e + '/' + packets.length 80 | delay(function () { animate(g, loc, c.getContext('2d'), next) }) 81 | }) 82 | return h('div', c, 
label) 83 | }) 84 | 85 | exports.random = starter(function (stop) { 86 | 87 | var c = canvas() //h('canvas', {width: W, height: H}) 88 | var N = 200 89 | var g = random.graph(N, 3) 90 | var loc = random.locations(200, c.width, c.height) 91 | // var loc = circular(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9) 92 | var label = h('label') 93 | 94 | animate(g, loc, c.getContext('2d'), function next (_, packets) { 95 | var e = 0 96 | for(var i in packets) 97 | if(packets[i].extra) e ++ 98 | label.textContent = e + '/' + packets.length 99 | delay(function () { animate(g, loc, c.getContext('2d'), next) }) 100 | }) 101 | return h('div', c, label) 102 | }) 103 | 104 | exports.grid = starter(function (stop) { 105 | 106 | var c = canvas() //h('canvas', {width: W, height: H}) 107 | var N = 200 108 | var g = {} 109 | var loc = {} 110 | for(var i = 0; i < N; i++) { 111 | if(i%10) 112 | G.addEdge(g, i, (N+i-1)%N) 113 | if((i%10) < 9) 114 | G.addEdge(g, i, (N+i+1)%N) 115 | if(i > 10) 116 | G.addEdge(g, i, (N+i-10)%N) 117 | if(i < N - 10) 118 | G.addEdge(g, i, (i+10)%N) 119 | 120 | loc[i] = {x: (i%10)*c.width/10, y: Math.floor(i/10)*c.height/20} 121 | } 122 | 123 | 124 | // var g = random.graph(N, 3) 125 | // var loc = random.locations(200, c.width, c.height) 126 | // var loc = circular(N, {x:c.width/2, y:c.height/2}, Math.min(c.width/2, c.height/2)*0.9) 127 | var label = h('label') 128 | 129 | animate(g, loc, c.getContext('2d'), function next (_, packets) { 130 | var e = 0 131 | for(var i in packets) 132 | if(packets[i].extra) e ++ 133 | label.textContent = e + '/' + packets.length 134 | delay(function () { animate(g, loc, c.getContext('2d'), next) }) 135 | }) 136 | return h('div', c, label) 137 | 138 | }) 139 | 140 | exports.spanning = starter(function () { 141 | var c = canvas() //h('canvas', {width: W, height: H}) 142 | var g = random.graph(200, 3) 143 | var loc = random.locations(200, c.width, c.height) 144 | var label = h('label') 145 | 146 | animate(g, loc, c.getContext('2d'), function next (_, packets) { 147 | var e = 0 148 | for(var i in packets) 149 | if(packets[i].extra) { 150 | e ++ 151 | delete g[packets[i].from][packets[i].to] 152 | delete g[packets[i].to][packets[i].from] 153 | } 154 | label.textContent = e + '/' + packets.length 155 | delay(function () { animate(g, loc, c.getContext('2d'), next) }) 156 | }) 157 | return h('div', c, label) 158 | 159 | }) 160 | exports.fragile = starter(function () { 161 | var N = 200 162 | var c = canvas() 163 | var loc = random.locations(N, c.width, c.height) 164 | var label = h('label') 165 | 166 | //;(function next () { 167 | var g = random.graph(N, 2) 168 | 169 | animate(g, loc, c.getContext('2d'), function next (_, packets) { 170 | var e = 0 171 | for(var i in packets) 172 | if(packets[i].extra) { 173 | e ++ 174 | delete g[packets[i].from][packets[i].to] 175 | delete g[packets[i].to][packets[i].from] 176 | } 177 | 178 | //randomly delete some nodes 179 | var d 180 | for(var i = 0; i < 1; i++) { 181 | var d = ~~(Math.random()*N) 182 | delete g[d] 183 | for(var i in g[d]) 184 | delete g[i][d] 185 | } 186 | 187 | label.textContent = e + '/' + packets.length 188 | if(!stop()) animate(g, loc, c.getContext('2d'), next) 189 | // delay(function () { animate(g, loc, c.getContext('2d'), next) }) 190 | }) 191 | 192 | //})() 193 | return h('div', c, label) 194 | 195 | }) 196 | 197 | var cmd = process.argv[2] 198 | 199 | 200 | document.body.appendChild(h('div', 201 | Object.keys(exports).map(name => { 202 | return h('div', 203 | h('h1', name), 
204 | exports[name](), 205 | h('hr') 206 | ) 207 | }) 208 | )) 209 | /* 210 | if(module.main && exports[cmd]) 211 | document.body.appendChild(exports[cmd]()) 212 | else { 213 | console.log('expected command:'+Object.keys(exports).join(', ') + ' but got:'+cmd) 214 | // window.close() 215 | //process.exit(1) 216 | } 217 | 218 | */ -------------------------------------------------------------------------------- /grid.js: -------------------------------------------------------------------------------- 1 | var el = require('./demos').grid() 2 | document.body.appendChild(el) 3 | 4 | el.dispatchEvent(new FocusEvent('focus')) 5 | -------------------------------------------------------------------------------- /hops.js: -------------------------------------------------------------------------------- 1 | 2 | var G = require('graphreduce') 3 | 4 | function random (N, K) { 5 | var g = {} 6 | for(var i = 0; i < N; i++) { 7 | for(var j = 0; j < K; j++) { 8 | var a = ~~(Math.random()*i) 9 | g = G.addEdge(g, a, i) 10 | g = G.addEdge(g, i, a) 11 | } 12 | } 13 | return g 14 | } 15 | 16 | function round(n, u) { 17 | u = Math.pow(10, u) 18 | return Math.round(n*u)/u 19 | } 20 | 21 | function hops (g, start, seen) { 22 | var front = {}, total, tcount = 0, redun = 0 23 | 24 | function visit (hops) { 25 | var count = 0 26 | for(var k in front) { 27 | for(var j in g[k]) { 28 | if(seen[j] == null) { 29 | front[j] = true 30 | seen[j] = hops 31 | total += hops 32 | tcount ++ 33 | count ++ 34 | } 35 | else 36 | redun ++ 37 | } 38 | delete front[k] 39 | } 40 | return count 41 | } 42 | front[start] = true 43 | var hops = 0, total = 0 44 | while(visit(++hops)); 45 | 46 | return [tcount, hops, round(total/tcount, 3), redun, round((redun/(tcount-1)), 3)] 47 | } 48 | 49 | //for(var i = 0; i < 100; i++) { 50 | console.log('|K|peers|hops|avg|msgs|inefficiency|') 51 | for(var i = 1; i <= 20; i++) { 52 | var seen = {} 53 | var g = random(1000, i) 54 | console.log('|'+[i].concat(hops(g, 0, seen)).join('|')+'|') 55 | } 56 | //} 57 | 58 | -------------------------------------------------------------------------------- /images/redundant.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 29 | 35 | 36 | 44 | 50 | 51 | 59 | 65 | 66 | 74 | 80 | 81 | 89 | 95 | 96 | 104 | 110 | 111 | 119 | 125 | 126 | 134 | 140 | 141 | 149 | 155 | 156 | 164 | 170 | 171 | 179 | 185 | 186 | 194 | 200 | 201 | 209 | 215 | 216 | 224 | 230 | 231 | 239 | 245 | 246 | 254 | 260 | 261 | 269 | 275 | 276 | 284 | 290 | 291 | 299 | 305 | 306 | 314 | 320 | 321 | 329 | 335 | 336 | 344 | 350 | 351 | 359 | 365 | 366 | 367 | 391 | 393 | 394 | 396 | image/svg+xml 397 | 399 | 400 | 401 | 402 | 403 | 408 | 411 | 418 | A 434 | 441 | B 462 | 469 | C 490 | 496 | 502 | 508 | 514 | 521 | 528 | 535 | 542 | 3-node network broadcasting two redundant messages 553 | 554 | 557 | 564 | A 580 | 587 | B 613 | 620 | C 641 | 647 | 653 | 659 | 665 | 672 | 679 | 3-node network broadcasting two messages, two notes 690 | 696 | 702 | 703 | 714 | 719 | 724 | 730 | 735 | 736 | 741 | 747 | 752 | 753 | 759 | 765 | 771 | 777 | 782 | reactivate B-C after loosing connection A-C 793 | 798 | 801 | 806 | 811 | 812 | 817 | 823 | 828 | 833 | 838 | 839 | 842 | 847 | 852 | 857 | 858 | 863 | 868 | 873 | 878 | 879 | 885 | 890 | Subsequent messages without extra overhead 906 | 912 | 917 | 922 | 927 | 928 | 929 | 1. 945 | 2. 961 | 3. 977 | 4. 
993 | 994 | 995 | -------------------------------------------------------------------------------- /images/test.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --- 5 | 6 | 7 | 8 | 1093 | 1094 | -------------------------------------------------------------------------------- /measurements/overlap.js: -------------------------------------------------------------------------------- 1 | var path = require('path') 2 | 3 | var d = require('dynamic-dijkstra')(require('dynamic-dijkstra/simple')) 4 | 5 | var g = require(path.join(process.env.HOME, '.ssb', 'flume', 'contacts2.json')).value 6 | 7 | var HOPS = 2 8 | 9 | function first (o) { 10 | for(var k in o) return k 11 | } 12 | 13 | function compare (a, b) { 14 | 15 | } 16 | 17 | function diff (a, b) { 18 | var hits = 0, misses = 0, misses2 = 0, total = 0 19 | for(var k in a) 20 | if(a[k] > 0) { 21 | total ++ 22 | if(b[k] != null && b[k] > 0) { 23 | hits++ 24 | } 25 | else { 26 | misses ++ 27 | } 28 | } 29 | for(var k in b) 30 | if(b[k] > 0) { 31 | if(!(a[k] != null && a[k] > 0)) { 32 | total ++ 33 | misses2 ++ 34 | } 35 | } 36 | 37 | return { 38 | hits: hits, misses: misses, misses2: misses2, total: total, 39 | avg: hits/total 40 | } 41 | } 42 | 43 | var hops = d.traverse(g, null, HOPS, first(g)) 44 | var peers = {} 45 | for(var k in hops) { 46 | if(hops[k] > 0 && hops[k] == 1) { 47 | var _hops = d.traverse(g, null, HOPS, k) 48 | var data = diff(hops, _hops) 49 | data.hops = [0,0,0] 50 | for(var k in hops) 51 | if(hops[k] > 0 && _hops[k] > 0) { 52 | data.hops[_hops[k]] = (data.hops[_hops[k]] || 0) + 1 53 | peers[k] = (peers[k] || 0) + 1 54 | } 55 | 56 | console.log(k, data) 57 | 58 | } 59 | } 60 | var dist = {} 61 | for(var k in hops) 62 | if(hops[k] > 0) 63 | dist[k] = { hops: hops[k], peers: peers[k]} 64 | //console.log(dist) 65 | 66 | for(var i = 0; i < 100; i++) { 67 | 68 | covered = {} 69 | var oneHop = Object.keys(hops).filter(function (k) { 70 | return hops[k] === 1 71 | }).sort(function () { return Math.random () - 0.5}) 72 | .slice(0, 5).forEach(function (id) { 73 | var _hops = d.traverse(g, null, HOPS, id) 74 | for(var k in hops) { 75 | covered[k] = covered[k] || 0 76 | if(hops[k] > 0 && _hops[k] > 0) 77 | covered[k] ++ 78 | } 79 | }) 80 | 81 | function sum (a, b) { return (a || 0) + b } 82 | var times = [0,0,0,0,0,0], times2 = [0,0,0,0,0] 83 | for(var k in covered) { 84 | times2[covered[k]] = (times2[covered[k]] || 0) + 1 85 | if(hops[k] === 1) 86 | times[covered[k]] = (times[covered[k]] || 0) + 1 87 | } 88 | console.log(times) 89 | console.log(times2) 90 | var total = Object.keys(hops).length 91 | for(var k in times2) { 92 | var cuml = times2.slice(k).reduce(sum) 93 | console.log(k, times2[k], times2[k]/total, cuml, cuml/total) 94 | } 95 | } 96 | 97 | 98 | -------------------------------------------------------------------------------- /message-passing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/message-passing.jpg -------------------------------------------------------------------------------- /message-passing2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/message-passing2.jpg -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scalable-secure-scuttlebutt", 3 | "description": "", 4 | "version": "1.0.0", 5 | "homepage": "https://github.com/dominictarr/scalable-secure-scuttlebutt", 6 | "repository": { 7 | "type": "git", 8 | "url": "git://github.com/dominictarr/scalable-secure-scuttlebutt.git" 9 | }, 10 | "dependencies": { 11 | "statistics": "^3.3.0" 12 | }, 13 | "devDependencies": { 14 | "graphreduce": "^3.0.4", 15 | "hyperscript": "^2.0.2", 16 | "marked": "^4.0.16", 17 | "network-animator": "^1.0.0" 18 | }, 19 | "scripts": { 20 | "test": "set -e; for t in test/*.js; do node $t; done" 21 | }, 22 | "author": "'Dominic Tarr' (dominictarr.com)", 23 | "license": "MIT" 24 | } 25 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | 2 | # scalable secure scuttlebutt 3 | 4 | This is notes on a paper for ssb. 5 | 6 | ## assumptions about social networks 7 | 8 | TODO: find refrences to justify this assumption. 9 | 10 | * _power law assumption_: we expect activity to follow a power law. 11 | (a small proportion of users are update very frequently, 12 | a large number, only infrequently) 13 | 14 | * _tendency for local connections_: if you are "friends" with a peer 15 | it's likely that another friend is also a friend. 16 | 17 | * _hub connections assumption_: short paths to hubs: some users are "hubs", being connected 18 | with a large number of other peers. Paths to even distant users 19 | are short due connections via hubs. It's highly likely that 20 | someone you know follows any given celebrity. 21 | 22 | For simplicity, we model the following replication protocol designs in the context of connected random swarms. 23 | In practice, we do not want a design that replicates all messages in the entire network (especially because 24 | we intend it to scale to millions of users). For social media applications, users view the feeds of they 25 | have explicitly followed/friended. However, due to the strongly connected nature of the social graph 26 | (the primary way people meet is being introduced to friends of friends) the chance of having a considerable 27 | overlap in your follow graph with any friend is quite high. Thus, the simpler to model random network is a reasonable 28 | approximation to friend replication in a real social network - how good this approximation is is discussed in 29 | **TODO: write this** section. 30 | 31 | ## data models 32 | 33 | We use a simple data model that fits most social media application. 34 | The main resource is a _feed_, which is an append-only log of _messages_, 35 | without branches. Each peer may publish zero or more feeds, and subscribe 36 | to zero or more feeds. 37 | 38 | Each message contains the id of the feed, an always increasing sequence number, 39 | and some content. 
(Also, the hash of the previous message and a signature, but this paper focuses 40 | on the performance of our design, not the security, so we can leave this out of discussion for now) 41 | 42 | ``` 43 | Feed f = {id: f.id, log: [{ id: f.id, sequence, content }, ...]} 44 | ``` 45 | 46 | In this paper we can assume that each peer is the author of one feed and that the peer's id 47 | is also the id of that feed. The peer also stores the log of all the peers that it subscribes to. 48 | 49 | ``` 50 | Peer p = { id: p.id, feeds: { : f } } 51 | ``` 52 | 53 | ## comparison of replication algorithms. 54 | 55 | Starting with the simplest, develop models of data replication. 56 | >I basically just made up the O() notations... maybe this should be based on simulations instead? 57 | especially since some of my arguments depend on a certain factor 58 | being limited somewhat (by the nature of networks). 59 | 60 | ### polled scan: (RSS) Really Simple Syndication 61 | 62 | A publisher (`pub`, of type `Peer`) hosts content, and a subscriber (`sub`, also of type `Peer`) 63 | connect and the publisher sends their content. 64 | 65 | At each request, the publisher sends the entire feed. 66 | 67 | > (footnote: In practice, RSS truncates the feed and may not send older messages, 68 | so isn't provably eventually consistent, we've analyzed a simplified version, 69 | which has provable eventual consistency.) 70 | 71 | This is extremely simple to implement at the server end (RSS provides an XML file over HTTP) 72 | and slightly more complex at the client end, as clients append only the new values. 73 | It's assumed that messages are fairly small, text only, and large files are referenced as some sort of link. 74 | 75 | When a subscriber connects, the publisher replies `received = pub.feeds[pub.id]` 76 | (which means sending `pub.feeds[pub.id].length` messages) 77 | the subscriber then appends any new messages to their copy of that feed. 78 | `sub.feeds[pub.id].append(received[sub.feeds[pub.id].length...])` such that both copies of the feed are the same, 79 | that is, contain copies of the same messages. `sub.feed[pub.id] == pub.feed[pub.id]`. 80 | 81 | New messages are published over time, and so the subscriber periodically makes a request to each publisher. 82 | 83 | ``` 84 | interval(sub.pollFrequency, () => sub.feeds.each(id => sub.connect(id)) ) 85 | ``` 86 | 87 | So, every `sub.pollFrequency` all publishers are connected to and all messages from them are downloaded, 88 | old messages end up being sent many times unnecessarily, so the amount of bandwidth needed scales very badly. 89 | 90 | Bandwith needed for a subscriber can be calculated as the following: 91 | 92 | > (footnote: Assume that `pollFrequency` is number of polls 93 | within the given timeframe that we are calculating resource usage for. The important thing is how many polls are made. 94 | If we were to calculate usage per day, and there was one poll per day, pollFrequency is 1. In any case, we are 95 | more interested in exploring the relationship between the various design factors and resources used, so the important 96 | thing to observe here is that the total resource used is _multiplied_ by `pollFrequency`, doubling `pollFrequency` doubles 97 | the number of messages sent) 98 | 99 | ``` 100 | total_messages = sum(map(sub.feeds, id => sub.feeds[id].length)) 101 | sub.pollFrequency * total_messages 102 | ``` 103 | Each interval, the subscriber polls every publisher, and receives all messages. 
104 | Hence the total set of messages is redownloaded every interval. 105 | 106 | Bandwith needed for the publisher can be calculated as the following: 107 | 108 | ``` 109 | subscribers = sum(peers, peer => peer.feeds[pub.id] ? 1 : 0 )) 110 | avg_poll_frequency = sum(peers, peer => peer.feeds[pub.id] ? peer.pollFrequency : 0 )) / subscribers 111 | subscribers * avg_poll_frequency * pub.feed[pub.id].length 112 | ``` 113 | 114 | Clients have a tradeoff between bandwidth and latency. Either they use lots of bandwidth 115 | or wait a long time for new messages. So this design is not suitable for real-time communication. 116 | 117 | For publishers, this design also suffers from uncontrollable expenses. If there are suddenly 118 | many subscribers, or they set their `pollFrequency` very high, this increases costs for the 119 | publisher, which in practice will lead to outages. Thus the most popular content is the most 120 | likely to be unavailable, which is the opposite of what is needed. 121 | 122 | Also, this model uses a network connection per poll, is likely to be a 123 | limiting factor for publishers with large numbers of subscriptions. 124 | 125 | The total number of network connections over some time period 126 | is for the subscriber: 127 | 128 | `avg_poll_frequency * sub.feeds.length` 129 | 130 | and the publisher: 131 | 132 | `poll_frequency * subscriptions` 133 | 134 | ## append-only poll 135 | 136 | Messages in a feed have a total order defined by an always increasing 137 | value such as a sequence, such that any message's sequence is strictly greater 138 | than any preceding message. If the sequence number of the first message is 1, 139 | then the number of messages in the feed (`feed.length`) is also the sequence number of the last item. 140 | 141 | > (footnote: By sending messages in order, if a transmission fails part-way, 142 | the requester's copy of the feed is still a valid append-only log with no gaps - but their latest 143 | message is just not the true latest message. Next time they connect they will receive the missing messages.) 144 | 145 | Instead of sending all messages per poll, the subscriber requests all messages greater 146 | than the sequence number of the latest message they currently have. 147 | This requires sending on a tiny header (the sequence number) 148 | and the publisher only sends each message to each subscriber once. 149 | 150 | The publisher expects a sequence number, and returns any messages greater than that. 151 | ``` 152 | pub.serve(n => pub.feeds[pub.id][n...]) 153 | ``` 154 | 155 | The subscriber connects to a pub, and appends the messages the pub returns to their copy, 156 | 157 | ``` 158 | received = sub.connect(pub.id, sub.feeds[pub.id].length) 159 | sub.feeds[pub.id].append(received) 160 | ``` 161 | now the subscriber is consistent with the publisher. 162 | 163 | > (footnote: The publisher sends the messages in order, so if a connection fails part-way 164 | through, the subscriber's copy still has sequential messages 165 | 166 | The cost for the subscriber is as follows 167 | 168 | ``` 169 | sub.pollFrequency * sub.feeds.length + total_messages 170 | ``` 171 | 172 | This is a significant improvement over polled scan because each message is only downloaded once. 173 | However, the subscriber must still send their current sequence number to each publisher, on each poll. 
174 | Although we can resonably assume that the sequence number is significantly smaller 175 | than a message, if the `pollFrequency` or `sub.feeds.length` is high this can become significant. 176 | 177 | The number of connections needed are the same as polled scan. 178 | 179 | For a suddenly popular publisher, many incoming requests can still lead to availability problems, 180 | as the simple number of requests becomes overwhelming, although because the entire feed of messages 181 | does not need to be sent the practical limit is much higher. 182 | 183 | ## append-only gossip (scuttlebutt) 184 | 185 | In a gossip protocol, instead of subscribers polling publishers, 186 | "peers" which can be both publisher and subscriber, connect to each other randomly. 187 | On each connection, instead of requesting a single feed, peers send a "vector clock". 188 | Instead of representing a global sequence, a vector clock just includes the sequence on each 189 | peer that contributed to the state. A peer's current vector clock is just a map of the lastest 190 | sequence of each feed: 191 | 192 | ``` 193 | vector_clock = map(peer.feeds, id => peer.feeds[id].length) 194 | ``` 195 | 196 | When a peer receives the remote vector clock, they can simply calculate whether there are 197 | any messages they need to send and send them. 198 | 199 | ``` 200 | peer.serve(clock => mapValues(clock, (id, sequence) => peer.feeds[id][sequence...])) 201 | ``` 202 | 203 | A client just connects to a random peer, sends their clock, and appends messages they receive 204 | 205 | ``` 206 | each( 207 | peer.connect(random_peer.id, vector_clock), 208 | msg => peer.feeds[msg.id].append(msg) 209 | ) 210 | ``` 211 | 212 | Since a connection now sends the list of subscriptions, 213 | but only needs to connect to a single peer each poll interval, 214 | more bandwidth is used per connection, but less connections are used. 215 | The overall bandwidth used by a peer is the same as with append-only poll, 216 | but the number of connections is now only `O(poll_frequency)`. 217 | 218 | Because messages are no longer passed directly from the publisher to each subscriber, 219 | describing the time needed to disseminate a new message is more complicated. 220 | In the first poll interval, the publisher will be connected to at least 1 other peer. 221 | (The publisher makes 1 outgoing connection, but may receive any number of incoming connections.) 222 | If it gets passed to only a single peer, but in the second poll interval, there are now two peers able 223 | to disseminate the message. If they do not connect again, in the 3rd interval 224 | there will be 4 peers, and so on in powers of 2. However, as the number of peers 225 | with a given message increases the chance that any two connecting peers already both have the 226 | message increases too, and the rate of dissemination decreases. Thus overall rate 227 | of dissemination resembles an S curve. Since calculating the actual rate of dissemination 228 | is more complicated, and is affected by practical matters such as the probability that 229 | multiple peers connect a particular peer at once, instead of calculating 230 | the time, we take measurements from a simple simulation. 231 | 232 | The pattern of dissemination of a single message is the same as flooding gossip. 
233 | For a random network with 10,000 peers and each peer creating a connection to one 234 | other peer randomly each interval (so a given peer may receive zero or more incoming connections, 235 | but makes only one outgoing connection), the total number of intervals needed 236 | to diseminate a single message is very small compared to the number of peers. 237 | 238 | ``` 239 | round, dR, dT 240 | 1, 9, 10 241 | 2, 51, 61 242 | 3, 293, 354 243 | 4, 1195, 1549 244 | 5, 3903, 5452 245 | 6, 3875, 9327 246 | 7, 666, 9993 247 | 8, 7, 10000 248 | ``` 249 | 250 | In Amazon Dynamo, this protocol design is used to replicate 251 | membership information within a cluster of Dynamo nodes. 252 | The peers run inside a trusted enviroment, and all peers replicate 253 | all other peers. To add a peer to the network, that peer just 254 | needs to know any other peer. It's not necessary to inform 255 | any master node, and the cluster is highly resilient. 256 | 257 | This design has a significant advantage with availability. 258 | If a peer that originated a message goes offline, if they 259 | have disseminated a message to at least one other peer that message 260 | will continue to flood the network. If a publisher suddenly 261 | becomes very popular, it will not cost them extra resources, 262 | because it's the other peers which will provide the dissemination. 263 | 264 | ## update frequency, overlap, and peer selection 265 | 266 | In Amazon Dynamo, scuttlebutt replication is used as a subcomponent 267 | of the whole system - to keep track of the membership in the 268 | database cluster, and what range of the database each node is 269 | responsible for. When database requests come to a node, that 270 | information is used to route the request to nodes which can handle 271 | it. Each node therefore needs to replicate _all information_ about 272 | membership in the cluster, and also, that information must be kept 273 | continually up to date. Each node emits a regular heartbeat and 274 | this is gossiped across the cluster, and the other nodes use this 275 | information to calculate the probability that a given node is still 276 | active - thus wether requests should be routed to it. 277 | 278 | Other applications using are likely to differ in terms of whether 279 | peers need to replicate the entire dataset, or the regularity with 280 | which they broadcast updates, or both. For example, a chat 281 | application combines messages from everyone one in the "room", so 282 | each peer replicates the entire dataset, but each peer only 283 | writes messages to the chat as frequently or infrequently as they 284 | choose to. It's quite likely that a few peers write very frequently 285 | and others read but do not write, or write very little. 286 | 287 | Indeed, in most real world applications, not all updates are 288 | created on a regular basis. There may be a small number of people 289 | you communicate with frequently - the closest family and friends, 290 | but then a broad range of aquaintances that you speak with 291 | occasionally. This pattern, known as a power-law distribution, 292 | is frequently found in both natural and artificial phenomena. 293 | Books and Movies are dominated by a small amount of best sellers, 294 | but also a large number of cult classics, flops, or break-evens. 295 | Most companies fail in the first few years, but a small number 296 | become so successful that it offsets venture investments 297 | in all the companies that fail. 
Likewise, it's reasonable to 298 | expect that most applications, if they do not have an explicitly 299 | regular update pattern, such as a enviromental sensor, will 300 | probably have activity following a power law, in the distribution 301 | of updates. However, if many peers have only infrequent update, 302 | it's likely that any two peers will exchange vector 303 | clocks with mostly the same values, and this is wasted bandwidth. 304 | 305 | The other question, what portion of the dataset should be replicated 306 | to each node? In Dynamo, or the chat room, the replicated data is replicated to 307 | all nodes, but in most other applications, it's not really diserable 308 | for all peers to have all data. For example, in email, the only peers that really need a particular 309 | message are the sender and the receiver (mail forwarding agents are a necessary evil) 310 | 311 | Email is probably not suited to a replication pattern, as only 312 | the recipient and sender are intended to have a use for a given 313 | message, and email has enough problems with spam that replicating 314 | 3rd party messages seems strange. On the other hand, social media, 315 | seems extremely well-suited to a replication design: firstly, 316 | content is already log-oriented. typically, users explicitly 317 | "follow" or "friend" each other, and the main user interface element 318 | is viewing a combined feed of all follow's messages. "shares" 319 | are broadcast, usually intended to be read by all followers/friends. 320 | Each peer may want to only replicate their friend's data, but 321 | since the main way of meeting new friends is by meeting your friend's 322 | friends, there is a good chance that any friend also holds messages 323 | you wish to replicate. 324 | 325 | If less than the entire dataset is to be replicated to each peer, 326 | this also raises the question of _which peers to connect to?_ 327 | in email, this is not an easy question to answer, as any one knowing 328 | your email address can send you messages. On the other hand, 329 | social media applications present an elegant answer to this question: 330 | The peers you follow are the peers you should connect to, you 331 | are likely to share mutual friends with them, and thus they are 332 | likely to have the feeds you are looking for, and want the feeds 333 | you have. 334 | 335 | A social media application provides good an simple ways 336 | to both choose a partial dataset to replicate and choose who to 337 | replicate it with, and because of the high degree of connectivity 338 | of the social graph, it seems extremely likely that such an 339 | application built on top of an efficient gossip replication protocol 340 | could easily scale to an unlimited number of users. Provided 341 | the implementation can scale to the needs of most individual users, 342 | each user's data overlaps with their friends, and thus the network 343 | could easily cover the entire globe. 344 | 345 | The design we come up with here could be used in any application 346 | that needs to replicate data with a few thousand peers, wether 347 | the dataset be shared fully, or having a well defined overlap. 348 | We present a social media application only as a relatively 349 | flexible base-architecture. 350 | 351 | ## append-only gossip with request-skipping 352 | 353 | In practice, activity in most datasets follows a power law: 354 | some authors are highly prolific, but most only publish rarely. 
355 | Thus, it is likely that when two peers exchange a vector clock in 356 | append-only gossip, the majority of feeds mentioned have not changed. 357 | 358 | > (footnote: Indeed, this became a practical problem in secure-scuttlebutt, 359 | on each connection, each peer sending over half a megabyte of requests, 360 | yet not actually needing to send any messages.) 361 | 362 | The chance that no new messages are sent during a connection increases 363 | with `poll_frequency`. 364 | 365 | _request-skipping_ is an optimization to avoid making feed requests if it seems unlikely 366 | that a feed has changed, it requires storing the received clock from remote peers, 367 | but saves sending many headers after the first connection. 368 | 369 | On the first connection between two peers, the entire clock is sent, but on subsequent connections, 370 | the current clock is compared with the stored copy of the remote clock, and only the feeds that differ are sent. 371 | 372 | ``` 373 | // first connection 374 | local_clock = map(peer.feeds, id => peer.feeds[id].length) 375 | // take the stored remote clock, or an empty clock if this is the first connection. 376 | remote_clock = peer.clocks[remote.id] || {} 377 | conn = peer.connect(remote.id) 378 | 379 | conn.send(filter(local_clock, (id, seq) => remote_clock[id] != IGNORE && remote_clock[id] != seq)) 380 | 381 | remote_clock2 = conn.recv() 382 | remote_clock = peer.clocks[remote.id] = merge(remote_clock, remote_clock2) 383 | 384 | // if they have requested feeds we did not send, send our current seq for those feeds. 385 | conn.send(map( 386 | filter(remote_clock2, (id, seq) => local_clock[id] != seq), 387 | id => local_clock[id] || IGNORE 388 | )) 389 | 390 | // finally, send any needed messages 391 | conn.send(mapValues(remote_clock, (id, seq) => if local_clock[id] > seq && seq != IGNORE then peer.feeds[id][seq...])) 392 | each(conn.recv(), msg => peer.feeds[msg.author].append(msg)) 393 | ``` 394 | 395 | `IGNORE` is a special value used to indicate that the remote has requested a feed that we choose not to replicate. 396 | It is necessary to make a definite response in this case, because this enables the remote to remember we are not interested 397 | in this feed, and so they will avoid requesting this feed next time they respond. 398 | 399 | Once we receive the remote's clock and have compared it to the stored copy, 400 | we can calculate everything that needs to be send or received. In practice, 401 | long-lived connections are used, and we allow new clocks to be sent at any time, 402 | but for simplicity of describing the algorithm we represent it here as having 5 phases: 403 | send initial clock, receive remote clock, send response clock, send messages, receive messages. 404 | 405 | > (footnote: It is essential that we only update our record of the remote clock with data they have explicitly sent 406 | us, and _not_ based on the messages we have sent them. It is possible that a connection fails before 407 | our peer receives a message, but if they send us something we know they meant it.) 408 | 409 | If peers A and B are consistent with respect to feed X, neither will mention X the next time they connect. 410 | However, if either peer receives a new message in X, one of them will mention it and the other will respond, 411 | and the first will send the message. If both receive the new message before they next reconnect, they'll both 412 | mention it, but see they are at the same message and not send it. 
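The pseudocode above leans on loosely defined helpers (`map`, `filter`, `mapValues`), so a concrete rendering of the two clock-filtering steps may help. This is only a sketch: the `IGNORE` sentinel and the function names are assumptions for illustration, not the actual secure-scuttlebutt implementation.

```
var IGNORE = -1 // assumed sentinel; any reserved value that is not a valid sequence works

// initial clock: skip feeds the remote ignores, or where nothing has changed
// since the clock we last stored for them
function initialClock (localClock, storedRemoteClock) {
  var out = {}
  for (var id in localClock)
    if (storedRemoteClock[id] !== IGNORE && storedRemoteClock[id] !== localClock[id])
      out[id] = localClock[id]
  return out
}

// response clock: answer every feed the remote mentioned where our state differs,
// replying IGNORE for feeds we have chosen not to replicate
function responseClock (localClock, receivedClock) {
  var out = {}
  for (var id in receivedClock)
    if (localClock[id] !== receivedClock[id])
      out[id] = localClock[id] !== undefined ? localClock[id] : IGNORE
  return out
}
```

With these two functions, the five phases described above reduce to: send `initialClock`, receive the remote's clock, send `responseClock`, then exchange whatever messages the merged clocks imply.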
413 | 414 | If peer A requests a feed id X that B has not chosen to replicate, B receives `X: ` from A, 415 | and will reply with `X: IGNORE`. 416 | A will store `A.clocks[B.id][X] = IGNORE`, and B will store `B.clocks[A.id][X] = `. 417 | `IGNORE` is never sent in the initial clock, only in the response. If B later chooses to replicate X, 418 | the next time they connect to A, they'll check their current sequence (which will be 0 at the time they choose 419 | to replicate X), 420 | against the stored clock for B. They'll see that it's different and send `X: 0` 421 | in the initial clock. A will then see that B is no longer ignoring X, and will respond with their 422 | sequence for X. If B doesn't change their mind about X, A will never mention it again. 423 | 424 | > (footnote: In the case that B decides to replicate X, but somehow ends up with the same sequence 425 | that A has for X, then they won't mention it, however, sooner or later, they will receive a new 426 | message in X from someone else, and after this will mention it to A) 427 | 428 | The worst case, for two given peers exchanging a single feed, is when the poll frequency 429 | is greater or equal to the frequency that new messages are added. This means that each 430 | peer sends a vector clock element for every message added to that feed, so the maximum 431 | number of vector clock elements is the same as the number of messages sent. If the poll 432 | frequency is lower than the message frequency, efficiency increases as each vector clock 433 | element will correspond to potentially many messages. Since this at worst a constant 434 | factor of the number of messages, it's within acceptable bounds and poll frequency can be 435 | selected for maximum availability without trading off bandwidth usage. 436 | 437 | It is expected that in practice, message frequency differs greatly by feed. 438 | Request skipping saves sending vector clocks elements for infrequently updating 439 | feeds, so a great deal less vector clock elements need be sent than in append-only gossip, 440 | especially when using high poll frequencies. 441 | 442 | ``` 443 | messages + (peers_connected_to * peer.feeds.length) + (peer.pollFrequency / messages) 444 | ``` 445 | 446 | There is now only one multiplicative factor in the bandwidth complexity. 447 | We must send the entire vector clock to each peer that we will connect to, 448 | the first time we connect to them. However, luckily, to get provable eventual 449 | consistency, we do not actually need to connect to every peer. As messages 450 | are relayed, we only need the eventual connections to form a connected graph, 451 | _not_ for each peer to eventually connect. Consequently, a value for 452 | `peers_connected_to` can be somewhat smaller than the whole swarm. 453 | 454 | Simulating random networks with varying numbers of random connections, the 455 | measured probability that the graph is fully connected rapidly approaches 1 456 | as the average number of connected peers passes 2. As the number of edges 457 | continues to rise, the distance across the graph (and thus dissemination rate) 458 | drops. 
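The measurement behind the following table is a simple reachability check over randomly generated graphs; what follows is a condensed and slightly tidied sketch of `simulation/connected.js` from this repository (in the table, `average` and `stdev` refer to the measured hop count across runs).

```
// build a random undirected graph with N nodes and roughly M edges added per node,
// then breadth-first search from node 0 to see whether every node is reachable
function test (N, M) {
  var g = {}
  for (var i = 0; i < N; i++) {
    g[i] = g[i] || {}
    for (var k = 0; k + Math.random() < M; k++) {
      var j = ~~(Math.random() * N)
      g[j] = g[j] || {}
      g[i][j] = g[j][i] = true
    }
  }
  var reachable = { 0: true }, frontier = [0], hops = 0, count = 1
  while (frontier.length) {
    hops++
    var next = []
    frontier.forEach(function (k) {
      for (var j in g[k])
        if (!reachable[j]) { reachable[j] = true; next.push(j); count++ }
    })
    frontier = next
  }
  return { connected: count === N, hops: hops }
}
```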
459 | 460 | ``` 461 | edges, P(connected), average, stdev 462 | 1, 0.05, 57.26, 19.385365614297818 463 | 1.1, 0.46, 23.33, 2.549725475418886 464 | 1.2, 0.69, 18.1, 1.6763054614240047 465 | 1.3, 0.7, 15.08, 1.188949115816149 466 | 1.4, 0.8, 13.52, 1.2765578717786399 467 | 1.5, 0.91, 12.33, 0.8130805618141443 468 | 1.6, 0.9, 11.45, 0.82915619758885 469 | 1.7, 0.96, 10.59, 0.8011866199581761 470 | 1.8, 0.97, 9.83, 0.6333245613427602 471 | 1.9, 0.99, 9.29, 0.4958830507287036 472 | 2, 1, 8.72, 0.5306599664568481 473 | 3, 1, 6.91, 0.2861817604250792 474 | 5, 1, 5.39, 0.48774993593029137 475 | 10, 1, 4.59, 0.4918333050943186 476 | 20, 1, 4, 0 477 | ``` 478 | 479 | I would suggest using a fixed number of connections per peer in the range 5-10, 480 | would effectively gaurantee a fully connected network, and small dissemination rate, 481 | without scaling the number of full vector clocks to be sent by very much. 482 | 483 | Also note, this design requires storage of vector clocks, so reducing the number 484 | of peers connected to also keeps that within acceptable bounds. 485 | 486 | ## overlapping replication sets 487 | 488 | So far, we have analyzed the problem space as if all peers under consideration 489 | are replicating the same set of publishers. In some application designs it 490 | may make sense for all peers to replicate the same set of feeds, for example, 491 | in a task tracking system within a medium sized company or other organization. 492 | On the other hand, the really interesting use-cases are ones that scale to millions 493 | of users, and so it might not feasible to replicate all their data on the average device, 494 | even if you did want to. In secure-scuttlebutt, the target application is a social network. 495 | This provides an interesting middle ground, with both a fair amount of overlap and a 496 | reasonable expectation of it, since one of primary ways that people meet new friends 497 | is by meeting friends of friends. These encounters might be more or less formal, 498 | but nevertheless, the chance that any two friends have a number of mutual friends in 499 | common is fairly high. 500 | 501 | In the most conservative design, it might be desired to replicate only the direct 502 | friends "followed" by the user. If the follow graph is known, a set of replication 503 | peers can be carefully selected to ensure coverage of all follows. For each feed 504 | a remote peer follows that the local peer does not, an feed id and `IGNORE` will be sent, 505 | but after that, subsequent requests for that feed will be skipped. 506 | 507 | In the current secure-scuttlebutt design, by default peers replicate their friends, 508 | and the friends of their friends. Sampling the actual ssb data, choosing 5 random 509 | peers to replicate, and replicating feeds two hops out on the follow graph (friends, 510 | and friends of friends), in all samples, all the direct friends of the user were 511 | within 2 hop range of the 5 random peers, also on average ~75% (TODO: GRAPHS THESE) 512 | of friends of friends were replicated by at least one peer. In ssb, since this could 513 | be more carefully optimized by selecting peers carefully to maximize coverage, and 514 | since request-skipping means we'll only send headers for unreplicated feeds one time, 515 | we can just connect to more random feeds and still get acceptable efficiency. 516 | 517 | ## real-time broadcast 518 | 519 | It is obviously desirable that a communication network would 520 | carry messages quickly. 
For human to human text communication, 521 | latency within a few seconds is usually sufficient. However, 522 | most of the above replication strategies would be unviable 523 | with `poll_frequency` of a few seconds, not to mention, establishing 524 | a TCP connection has overhead, and several extra messages must be 525 | passed to make that an encrypted TCP connection. So, instead of 526 | simple polling, we should have connections with a longer lifespan - 527 | when a new connection is formed we exchange clocks and receive any 528 | old messages we are mssing, via the above polling algorithms, 529 | but then we "stay on the line", and if our peer receives any 530 | additional messages they send those too. 531 | Thus, we our model becomes _sync then broadcast_. 532 | 533 | In the non-gossip models, we must eventually connect to every 534 | peer we subscribe to. It would be unviable to hold long-lived 535 | connections to every peer, as they may number in the thousands, 536 | and the overhead of a each connection would be too much for 537 | most user devices. But with gossip, we can connect to just a small 538 | number of peers at a time and still receive messages from many peers. 539 | 540 | ## random connected network 541 | 542 | N peers are randomly connected with average K outgoing connections per peer. 543 | (outgoing, because each peer randomly chooses to connect to K other 544 | peers) as discussed in the previous section, the chance that the network 545 | is fully connected rapidly approaches 1 when as K approaches 2, and 546 | then the average shortest path between nodes shortens as redundant connections increase. 547 | For the network to broadcast a message, the originating peer sends it to all 548 | neighbouring peers, and when a peer receives a _new_ message, 549 | they send it to all their connected peers except the peer they received 550 | the message from. Consider a network with 3 peers and 2 connections each. 551 | A creates a new message and transmits a message to B and C, and B and C then 552 | transmit the message to each other. Thus the message is sent twice 553 | by A and once each by B and C. The total bandwidth used by the 554 | network is 4. Since A creates the message and there are only 555 | two other peers, only the transmissions to B and C are necessary, 556 | but B and C don't know that the other already has the message. 557 | 558 | Simulating a broadcast in a random network with up to 20 connections 559 | per peer, and measuring hops, average hops, messages transferred: 560 | 561 | |K|peers|hops|avg|msgs|inefficiency| 562 | |-|-----|----|---|----|------------| 563 | |1|1000|14|6.657|999|1| 564 | |2|1000|7|3.657|2981|2.984| 565 | |3|1000|6|2.944|4947|4.952| 566 | |4|1000|5|2.842|6913|6.92| 567 | |5|1000|5|2.605|8861|8.87| 568 | |6|1000|5|2.515|10803|10.814| 569 | |7|1000|4|2.388|12731|12.744| 570 | |8|1000|4|2.361|14671|14.686| 571 | |9|1000|4|2.306|16605|16.622| 572 | |10|1000|4|2.193|18487|18.506| 573 | |11|1000|4|2.201|20357|20.377| 574 | |12|1000|4|2.136|22237|22.259| 575 | |13|1000|4|2.118|24163|24.187| 576 | |14|1000|4|2.118|25993|26.019| 577 | |15|1000|4|2.027|27877|27.905| 578 | |16|1000|4|2.008|29709|29.739| 579 | |17|1000|4|2.046|31567|31.599| 580 | |18|1000|4|1.994|33393|33.426| 581 | |19|1000|4|1.94|35281|35.316| 582 | |20|1000|4|1.933|37135|37.172| 583 | 584 | > (footnote: With 1000 peers and one connection we only need to send 585 | 999 messages because the first peer is the author of the message 586 | and did not need to send it.) 
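Stated as code, the flooding rule measured above is just a receive handler per peer; the table appears to come from `hops.js` in this repository, and the sketch below uses illustrative peer/connection objects rather than the real protocol objects.

```
// flooding: relay each message the first time it is seen,
// to every connection except the one it arrived on
function onMessage (peer, msg, from) {
  var key = msg.author + ':' + msg.sequence
  if (peer.seen[key]) return            // already known: this delivery was redundant
  peer.seen[key] = true
  peer.connections.forEach(function (conn) {
    if (conn !== from) conn.send(msg)
  })
}
```

Every delivery that hits the `seen` check is bandwidth already spent; the `msgs` column above is the total number of transmissions, and `inefficiency` is that total divided by the 999 strictly necessary ones.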
587 | 588 | Note, with more than one connection, number of hops (which is 589 | the time taken for the last message to arrive) decreases slowly, 590 | but the average case, time for 50% of the network to receive the message, 591 | decreases much quicker and the (bandwidth) 592 | inefficiency increases fastest. 593 | With K=2, nearly 3 times as many messages as necessary are sent. 594 | and with K=5, nearly 9 times too many messages are sent! 595 | 596 | So with a simple flooding design, we pay a lot in bandwidth for reducing latency. 597 | 598 | If we were to prune the redundant connections, we could get low latency 599 | without bandwidth overhead. However, since a pure spanning 600 | tree has no redundency it's also very fragile. If one connection close 601 | to the root of the tree (the originator of a message) fails, all downstream 602 | peers will be cut off. 603 | 604 | ## spanning trees 605 | 606 | Epidemic broadcast trees (EBT) is an algorithim to form a spanning tree from 607 | a random network, but instead of completely removing redundant connections, 608 | they are just moved into a _lazy_ or _pull_ state. When in the lazy state, 609 | only headers (equivalent to vector clock elements) are sent. Which connections 610 | are redundant can be detected by each peer observing the order in which they 611 | first receive a message, and thereafter observing latency. For example, in the 612 | 3 node network discussed in the previous section, A transmits a message to B and C, 613 | neither of them have received this message before, so they know that their connection 614 | to A is not redundant. Then, they each receive a second copy of the message from B,C 615 | so they both know that for messages from A, the connection between B-C is redundant. 616 | So, B and C exchange short messages each requesting the other to disable that connection (for messages from A). When A broadcasts another message, B and C receive 617 | it directly from A again, but since the redundant connections are disabled, they do not 618 | transmit it again. Instead, they only send a short message, equivalent to a vector clock 619 | element, to indicate they know this message exists. If later, the connection between 620 | A and C breaks, and A broadcasts another message. It will only be received by B. 621 | B then sends the short lazy check to C, who then realizes that this is the first they 622 | have heard about this message - therefore, B must now be closer to the source than they are. 623 | C then sends a message to re-request active transmission of messages from A, and B sends 624 | the message to C. (note, re-establishing an active connection takes just one round-trip) 625 | 626 | ![redundant messages](./images/redundant.svg) 627 | 628 | EBT still sends redundant data, but the notes sent along the redundant connections 629 | are significantly smaller than the messages. Also, if a delay is introduced, 630 | it is not necessary to send a note for every message, but just the latest message. 631 | If several are received in quick succession, only one note needs to be sent. 632 | Also, if a random factor, somewhat greater than round trip time is added, 633 | then 50% of the time the same note is received before it is sent. 
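In code, EBT amounts to two small additions to the flooding handler sketched earlier: a redundant delivery demotes that connection to lazy (for that feed), and a note about an unknown message promotes it back to eager. This is a rough sketch with illustrative names and assumed per-(feed, connection) state, not the actual ssb-ebt implementation.

```
// eager path: full messages
function onEagerMessage (peer, feedId, msg, from) {
  if (msg.sequence <= latest(peer, feedId)) {
    from.send({ lazy: feedId })                  // we already had it: ask the sender to go lazy
    return
  }
  peer.feeds[feedId].append(msg)
  peer.connections.forEach(function (conn) {
    if (conn === from) return
    if (conn.mode[feedId] === 'eager') conn.send(msg)
    else conn.scheduleNote(feedId, msg.sequence) // lazy: only a short note, sent after a random delay
  })
}

// lazy path: short notes
function onNote (peer, feedId, seq, from) {
  if (seq > latest(peer, feedId))
    from.send({ eager: feedId })                 // they heard about it first: switch back to eager
}

function latest (peer, feedId) {
  return peer.feeds[feedId] ? peer.feeds[feedId].length : 0
}
```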
638 | 
639 | ## singleton hub
640 | 
641 | > (footnote: To make the strongest argument for the performance of EBT + request-skipping,
642 | > compare it to a fully centralized model.)
643 | 
644 | To this point, most social networks have been implemented
645 | as a star-shaped network: essentially one peer that distributes
646 | all messages to all peers. If this were designed around a replication
647 | protocol, a client would use something like the append-only poll,
648 | except the server would remember each client's vector clock at each timestamp
649 | and all their subscriptions, so the client would only send the time they last synced.
650 | The server would then send all new messages on any of their subscriptions.
651 | 
652 | On each connection, the client needs to send their last connection time,
653 | and the server still has to send each message. If a client polls at a low rate,
654 | the client sends one header and receives many messages. If the client
655 | polls at a high rate, maybe they make one request per message. (Long-lived
656 | connections would also help here.)
657 | 
658 | They would request the sequence number representing
659 | their own read feed; on each connection they'd request any messages
660 | that have occurred since the last connection, but the central server
661 | still has to send the messages. So the cost to a client is:
662 | 
663 | `O(poll_frequency + messages)`
664 | 
665 | The central server, of course, must pay for a lot of resources.
666 | 
667 | Bandwidth:
668 | 
669 | `O(network_clients * poll_frequency + network_clients * messages)`
670 | 
671 | and connections:
672 | 
673 | `O(network_clients * poll_frequency)`
674 | 
675 | If a network is successful, `network_clients` can easily get very
676 | large: millions or billions of clients.
677 | 
678 | ## conclusion
679 | 
680 | An idealized centralized network is presented here as the best possible case for efficiency,
681 | yet it only beats our design by a constant factor. Between EBT with a fixed number
682 | of peers and request-skipping, we can manage the bandwidth used; the main difference
683 | is only in vector clock elements, which are very small compared to messages.
684 | 
685 | In the current secure-scuttlebutt implementation, which uses base64-encoded strings to
686 | encode 256-bit public keys plus a base-10 integer, vector clock elements are about 60 bytes,
687 | and the average message is 660 bytes (although the maximum message size is 8kb), so the average
688 | message is 11 times bigger than a single vector clock element.
689 | 
690 | I would expect that, for a typical peer, most messages would be replicated after it has been offline for a while,
691 | so one vector clock element brings in many messages. For messages replicated in real time,
692 | the extra bandwidth used is managed by limiting the number of connections.
693 | 
694 | The performance of our design is close enough to that of the optimal centralized system to realistically
695 | argue that it is viable at massive scale. In practice, we believe that small difference will easily
696 | be made up for by the other advantages of adopting a decentralized system.
697 | For example, the significant costs associated with running such a system are now spread evenly across the network participants.
With a fully decentralized gossip protocol, peers can join in any 698 | topology. If two peers are offline, but nearby each other, it is possible for them to share data 699 | directly over bluetooth, wifi, or by directly exchanging physical media. This means secure-scuttlebutt 700 | is potentially able to service remote areas of the earth that have not yet received modern infrastructure, 701 | as well as areas where that infrastructure is disrupted by warfare or other disasters. 702 | 703 | 704 | 705 | 706 | -------------------------------------------------------------------------------- /random.js: -------------------------------------------------------------------------------- 1 | var el = require('./demos').random() 2 | document.body.appendChild(el) 3 | 4 | el.dispatchEvent(new FocusEvent('focus')) 5 | -------------------------------------------------------------------------------- /references.md: -------------------------------------------------------------------------------- 1 | 2 | Efficient Reconcilliation and Flow Control for Anti-Entropy Protocols 3 | https://www.cs.cornell.edu/home/rvr/papers/flowgossip.pdf 4 | 5 | Dynamo: Amazon's Highly Available Key-value Store 6 | https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf 7 | 8 | Timestamps in Message-Passing Systems That Preserve Partial Ordering 9 | http://zoo.cs.yale.edu/classes/cs426/2012/lab/bib/fidge88timestamps.pdf 10 | 11 | RSS specification 12 | https://cyber.harvard.edu/rss/rss.html 13 | 14 | Atom Specification 15 | https://tools.ietf.org/html/rfc4287 16 | 17 | Epidemic Broadcast Trees 18 | http://www.gsd.inesc-id.pt/~ler/reports/srds07.pdf 19 | 20 | 21 | -------------------------------------------------------------------------------- /repair-lazy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dominictarr/scalable-secure-scuttlebutt/321c1d624beba2b23272eedc5ac8b34d20a34c57/repair-lazy.jpg -------------------------------------------------------------------------------- /simulation/connected.js: -------------------------------------------------------------------------------- 1 | var stats = require('statistics') 2 | 3 | var N = 1000, M = 5 4 | 5 | function isEmpty (o) { 6 | for(var k in o) return false 7 | return true 8 | } 9 | 10 | function test (N, M) { 11 | var g = {} 12 | 13 | for(var i = 0; i < N; i++) { 14 | g[i] = g[i] || {} 15 | for(var k = 0; k + Math.random() < M; k++) { 16 | var j = ~~(Math.random()*N) 17 | g[j] = g[j] || {} 18 | g[i][j] = g[j][i] = true 19 | } 20 | } 21 | 22 | var reachable = {} 23 | var next = {}, hops = 0, connected = 1 24 | 25 | reachable[0] = next[0] = true 26 | 27 | while(!isEmpty(next)) { 28 | hops ++ 29 | for(var k in next) { 30 | for(var j in g[k]) 31 | if(!reachable[j]) { 32 | connected ++ 33 | reachable[j] = true 34 | next[j] = true 35 | } 36 | delete next[k] 37 | } 38 | } 39 | return { connected: connected === N, reachable: connected, hops: hops } 40 | } 41 | 42 | console.log('P(edge), P(connected), average, stdev') 43 | 44 | ;[1.0,1.1,1.2,1.3,1.4,1.5,1.6, 1.7,1.8,1.9,2,3,5,10, 20].forEach(function (m) { 45 | var prob = 0, dist = stats.initial() 46 | var c = 0 47 | for(var i = 0; i < 100;i++) { 48 | var data = test(N, m) 49 | dist = stats(dist, data.hops) 50 | if(data.connected) 51 | c++ 52 | } 53 | console.log([m, c/100, dist.mean, dist.stdev].join(', ')) 54 | }) 55 | 56 | -------------------------------------------------------------------------------- /simulation/flood.js: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | console.log('round, dR, dT') 4 | var n = {}, C = 1, N = 10000 5 | 6 | for (var i = 0; i < N; i++) 7 | n [i] = false 8 | 9 | n[0] = true 10 | var k = 1 11 | while(C < N) { 12 | var m = 0, nn = {} 13 | for(var i = 0; i < N; i++) { 14 | var j = ~~(Math.random()*N) 15 | if(n[i] != n[j]) { 16 | m++ 17 | n[i] = n[j] = true 18 | //nn[i] = nn[j] = true 19 | C++ 20 | } 21 | } 22 | // for(var K in nn) 23 | // n[K] = nn[K] 24 | console.log([k++, m, C].join(', ')) 25 | } 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /simulation/friends.js: -------------------------------------------------------------------------------- 1 | 2 | var B = 0.8, 3 | -------------------------------------------------------------------------------- /simulation/spanning.js: -------------------------------------------------------------------------------- 1 | function toLetter (i) { 2 | return String.fromCodePoint('A'.codePointAt(0) + i) 3 | } 4 | 5 | var N = 20, k = 3 6 | var g = {} 7 | var hops = {} 8 | hops.A = 0 9 | g.A = {} 10 | for(var i = 1; i < N; i++) { 11 | g[toLetter(i)] = g[toLetter(i)] || {} 12 | var j = ~~(Math.random()*i) 13 | console.log('->', toLetter(j), toLetter(i), g) 14 | g[toLetter(j)][toLetter(i)] = 1 15 | } 16 | 17 | for(var i = 0; i < N; i++) { 18 | for(var ii = 1; ii +Math.random() < k; ii++) { 19 | var j = ~~(Math.random()*N) 20 | g[toLetter(i)][toLetter(j)] = 1 21 | } 22 | } 23 | 24 | function isEmpty (e) { 25 | for(var k in e) return false 26 | return true 27 | } 28 | 29 | function spanning (g) { 30 | var next = {} 31 | var reachable = {} 32 | var s = {} 33 | next['A'] = true 34 | reachable['A'] = 0 35 | while(!isEmpty(next)) { 36 | for(var k in next) { 37 | for(var j in g[k]) { 38 | if(reachable[j] == undefined) { 39 | s[k] = s[k] || {} 40 | s[k][j] = reachable[k] + 1 41 | reachable[j] = reachable[k] + 1 42 | next[j] = true 43 | } 44 | } 45 | delete next[k] 46 | } 47 | } 48 | 49 | var hops = {} 50 | // console.log('S', s, '...', g) 51 | return {hops: reachable, spanning: s} 52 | } 53 | 54 | 55 | console.log(g) 56 | 57 | function remap (g) { 58 | var s = spanning(g) 59 | var remap = {}, i = 0 60 | for(var k in s.hops) 61 | remap[k] = toLetter(i++) 62 | console.log(s.spanning) 63 | console.log(s.hops) 64 | console.log(remap) 65 | var _g = {} 66 | for(var j in g) 67 | for(var k in g[j]) { 68 | _g[remap[j]] = _g[remap[j]] || {} 69 | _g[remap[j]][remap[k]] = g[j][k] 70 | } 71 | console.log("G", _g) 72 | return spanning(_g) 73 | } 74 | 75 | console.log(remap(g).spanning) 76 | 77 | --------------------------------------------------------------------------------