├── .gitignore ├── .travis.yml ├── README.md ├── erlang └── model │ ├── .gitignore │ ├── README.md │ ├── qsc.erl │ └── run.sh ├── go ├── dist │ ├── README.md │ ├── causal.go │ ├── dist_test.go │ ├── doc.go │ ├── node.go │ ├── qsc.go │ ├── set.go │ ├── tlc.go │ └── vec.go ├── lib │ ├── backoff │ │ ├── retry.go │ │ ├── retry_test.go │ │ └── rfq │ │ │ └── doc.go │ ├── cas │ │ ├── cas.go │ │ └── test │ │ │ ├── cas.go │ │ │ └── cas_test.go │ ├── doc.go │ └── fs │ │ ├── atomic │ │ ├── atomic.go │ │ └── atomic_test.go │ │ ├── casdir │ │ └── state.go │ │ └── verst │ │ └── state.go └── model │ ├── README.md │ ├── doc.go │ ├── model_test.go │ ├── node.go │ ├── qsc.go │ ├── qscod │ ├── README.md │ ├── core │ │ ├── cli.go │ │ └── test │ │ │ ├── cli.go │ │ │ └── cli_test.go │ ├── encoding │ │ └── enc.go │ ├── fs │ │ ├── casdir │ │ │ └── cas_test.go │ │ ├── simple │ │ │ ├── store.go │ │ │ └── store_test.go │ │ └── store │ │ │ ├── store.go │ │ │ └── store_test.go │ └── qscas │ │ ├── doc.go │ │ ├── group.go │ │ ├── group_test.go │ │ ├── rand.go │ │ └── store.go │ ├── quepaxa │ ├── consensus.go │ ├── isr.go │ └── proposal.go │ └── tlc.go ├── spin ├── README.md ├── qp.pml ├── qpm.pml ├── qsc.pml ├── results-qp.txt ├── results-qpm.txt └── run.sh └── tools └── qsc ├── .gitignore ├── group.go ├── main.go └── string.go /.gitignore: -------------------------------------------------------------------------------- 1 | pan 2 | pan.* 3 | *.pml.trail 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | 3 | language: go 4 | 5 | go: 6 | - 1.12.x 7 | 8 | before_install: 9 | - sudo apt-get install -y spin 10 | 11 | install: 12 | - go get -u golang.org/x/lint/golint 13 | 14 | script: 15 | - go vet ./... 16 | - if [ "$( gofmt -l . )" ]; then gofmt -d; exit 1; fi 17 | - golint -set_exit_status ./... 18 | - go test ./... 19 | - cd spin; ./run.sh 20 | 21 | notifications: 22 | email: false 23 | 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | This repository contains multiple prototype implementations of 3 | Threshold Logical Clocks (TLC) and Que Sera Consensus (QSC), 4 | as described in the following papers: 5 | 6 | * [Threshold Logical Clocks for Asynchronous Distributed Coordination and Consensus](https://arxiv.org/abs/1907.07010) 7 | * [Que Sera Consensus: Simple Asynchronous Agreement with Private Coins and Threshold Logical Clocks](https://arxiv.org/abs/2003.02291) 8 | 9 | The following prototype implementations of TLC and QSC are available 10 | in multiple languages: 11 | 12 | * [erlang/model](erlang/model/) contains a minimalistic model implementation 13 | of the QSC, TLCB, and TLCR algorithms detailed in the 14 | [new QSC preprint](https://arxiv.org/abs/2003.02291). 15 | This model implements QSC using Erlang processes and communication 16 | on a single machine for illustrative simplicity, although 17 | [distributed Erlang](https://erlang.org/doc/reference_manual/distributed.html) 18 | should make it straightforward to extend this model 19 | to true distributed consensus. 20 | Erlang's [selective receive](https://ndpar.blogspot.com/2010/11/erlang-explained-selective-receive.html) 21 | is particularly well-suited to implementing TLCR concisely. 
22 | The model consists of only 73 lines of code 23 | as measured by [cloc](https://github.com/AlDanial/cloc), 24 | including test code, 25 | or only 37 lines comprising the consensus algorithm alone. 26 | 27 | * [go/model](go/model/) contains a minimalistic model implementation in Go 28 | of TLC and QSC as described in the 29 | [original TLC preprint](https://arxiv.org/abs/1907.07010). 30 | This model illustrates the key concepts 31 | using goroutines and shared memory communication for simplicity. 32 | It is not useful in an actual distributed context, 33 | but being less than 200 code lines long 34 | as measured by [cloc](https://github.com/AlDanial/cloc), 35 | it is ideal for studying and understanding TLC and QSC. 36 | 37 | * [go/model/qscod](go/model/qscod/) 38 | contains a model implementation in Go of QSCOD, 39 | the client-driven "on-demand" consensus algorithm outlined in the 40 | [new QSC preprint](https://arxiv.org/abs/2003.02291). 41 | This formulation of QSC consumes no bandwidth or computation 42 | when there is no work to be done (hence on-demand), 43 | and incurs only O(n2) communication complexity 44 | per client-driven agreement. 45 | 46 | * [go/dist](go/dist/) contains a simple but working 47 | "real" distributed implementation of TLC and QSC in Go 48 | for a fail-stop (Paxos-like) threat model. 49 | It uses TCP, TLS encryption and authentication, 50 | and Go's native Gob encoding for inter-node communication. 51 | At less than 1000 code lines long 52 | as measured by [cloc](https://github.com/AlDanial/cloc), 53 | it is still probably one of the simplest implementations 54 | of asynchronous consensus around. 55 | 56 | * [spin](spin/) contains a simple Promela model of the core of TLC and QSC 57 | for the [Spin model checker](https://spinroot.com/spin/whatispin.html). 58 | Although this implementation models TLC and QSC only at a 59 | very high, abstract level, it captures the basic logic enough 60 | to lend confidence to the correctness of the algorithm. 61 | 62 | All of this code is still extremely early and experimental; 63 | use at your own risk. 64 | 65 | [![Build Status](https://travis-ci.com/dedis/tlc.svg?branch=master)](https://travis-ci.com/dedis/tlc) 66 | 67 | -------------------------------------------------------------------------------- /erlang/model/.gitignore: -------------------------------------------------------------------------------- 1 | *.beam 2 | -------------------------------------------------------------------------------- /erlang/model/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a minimal implementation of 2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC) 3 | in [Erlang](https://www.erlang.org) 4 | for fail-stop, non-Byzantine environments. 5 | This model implements the QSC, TLCB, and TLCR algorithms detailed in the 6 | [new QSC preprint](https://arxiv.org/abs/2003.02291) 7 | in only 37 lines of code representing the actual algorithm. 8 | 9 | For background information on QSC and TLC, 10 | and other model implementations in several languages, please see the 11 | [top level of this repository](https://github.com/dedis/tlc/). 12 | -------------------------------------------------------------------------------- /erlang/model/qsc.erl: -------------------------------------------------------------------------------- 1 | -module(qsc). 2 | -export([qsc/1, test/0]). 3 | 4 | % Node configuration is a tuple defined as a record. 
5 | % nn: node number from 1..len(pids) 6 | % tr: receive threshold 7 | % tb: broadcast threshold 8 | % ts: spread threshold 9 | % pids: list of process IDs of all nodes 10 | % steps: maximum number of time steps to run, nil to run forever 11 | % choose: function choose(Config, Step) -> Msg to choose application message 12 | % random: function random() -> Value to choose a random priority value 13 | % deliver: function deliver(History) to deliver a committed history 14 | -record(config, {nn, tr, tb, ts, pids, steps, choose, random, deliver}). 15 | 16 | % A history is a record representing the most recent in a chain. 17 | -record(hist, {step, nn, msg, pri, pred}). 18 | 19 | % qsc(C) -> (never returns) 20 | % Implements Que Sera Co nsensus (QSC) atop TLCB and TLCR. 21 | qsc(C) -> qsc(C, 1, #hist{step=0}). % start at step 1 with placeholder pred 22 | qsc(#config{steps=Max}, S0, _) when S0 > Max -> {}; % stop after Max steps 23 | qsc(#config{nn=I, choose=Ch, random=Rv, deliver=D} = C, S0, H0) -> 24 | H1 = #hist{step=S0, nn=I, msg=Ch(C, S0), pri=Rv(), pred=H0}, 25 | {S1, R1, B1} = tlcb(C, S0, H1), % Try to broadcast (confirm) proposal 26 | {H2, _} = best(B1), % Choose some best eligible proposal 27 | {S2, R2, B2} = tlcb(C, S1, H2), % Re-broadcast it to reconfirm proposal 28 | {Hn, _} = best(R2), % Choose best eligible for next round 29 | {H3, Unique} = best(R1), % What is the best potential history? 30 | Final = lists:member(Hn, B2) and (Hn == H3) and Unique, 31 | if Final -> D(Hn), qsc(C, S2, Hn); % Deliver history Hn 32 | true -> qsc(C, S2, Hn) % Just proceed to next consensus round 33 | end. 34 | 35 | % best(L) -> {B, U} 36 | % Find and return the best (highest-priority) history B in a nonempty list L, 37 | % and a flag U indicating whether B is uniquely best (highest priority) in L. 38 | best([H]) -> {H, true}; % trivial singleton case 39 | best(L) -> 40 | Compare = fun(#hist{pri=AR}, #hist{pri=BR}) -> AR >= BR end, 41 | [#hist{pri=BR} = B, #hist{pri=NR} | _] = lists:sort(Compare, L), 42 | {B, (BR /= NR)}. 43 | 44 | 45 | % tlcb(C, S, H) -> {S, R, B} 46 | % Implements the TLCB algorithm for full-spread synchronous broadcast. 47 | tlcb(#config{ts=Ts} = C, S0, H) -> 48 | {S1, R1, _} = tlcr(C, S0, H), % Step 1: broadcast history H 49 | {S2, R2, _} = tlcr(C, S1, R1), % Step 2: re-broadcast list we received 50 | R = sets:to_list(sets:union([sets:from_list(L) || L <- [R1 | R2]])), 51 | B = [Hc || Hc <- R, count(R2, Hc) >= Ts], 52 | {S2, R, B}. % New state, receive and broadcast sets 53 | 54 | % count(LL, H) -> N 55 | % Return N the number of lists in list-of-lists LL that include history H. 56 | count(LL, H) -> length([L || L <- LL, lists:member(H, L)]). 57 | 58 | 59 | % tlcr(C, S, M) -> {S, R, nil} 60 | % Implements the TLCR algorithm for receive-threshold synchronous broadcast. 61 | tlcr(#config{pids=Pids} = C, S, M) -> 62 | [P ! {S, M} || P <- Pids], % broadcast next message 63 | tlcr_wait(C, S, []). % wait for receive threshold 64 | tlcr_wait(#config{tr=Tr} = C, S, R) when length(R) < Tr -> 65 | receive {RS, RM} when RS == S -> tlcr_wait(C, S, [RM | R]); 66 | {RS, _} when RS < S -> tlcr_wait(C, S, R) % drop old msgs 67 | end; % when RS > S message stays in the inbox to be received later 68 | tlcr_wait(_, S, R) -> {S+1, R, nil}. 69 | 70 | 71 | % Run a test-case configured for a given number of potentially-failing nodes F, 72 | % then signal Parent process when done. 73 | test_run(F, Steps) -> 74 | % Generate a standard valid configuration from number of failures F. 
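	% For F potentially-failing nodes this test uses N = 3*F nodes in total,
	% a receive threshold Tr = 2*F, a broadcast threshold Tb = F, and a
	% spread threshold Ts = F+1.  Note that any two sets of Tr = 2*F nodes
	% drawn from N = 3*F nodes must overlap in at least F nodes.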
75 | N = 3*F, Tr = 2*F, Tb = F, Ts = F+1, 76 | io:fwrite("Test N=~p F=~p~n", [N, F]), 77 | 78 | % Function to choose message for node I to propose at TLC time-step S. 79 | Choose = fun(#config{nn=I}, S) -> {msg, S, I} end, 80 | 81 | % Choose a random value to attach to a proposal in time-step S. 82 | % This low-entropy random distribution is intended only for testing, 83 | % so as to ensure a significant rate of ties for best priority. 84 | % Production code should use high-entropy cryptographic randomness for 85 | % maximum efficiency and strength against intelligent DoS attackers. 86 | Random = fun() -> rand:uniform(N) end, 87 | 88 | % Spawn a process to receive and consistency-check committed histories. 89 | Checker = spawn(fun() -> test_checker(#hist{step=0}) end), 90 | 91 | % The nodes will "deliver" histories by sending them back to us. 92 | Deliver = fun(H) -> Checker ! {check, H} end, 93 | 94 | % Launch a process representing each of the N nodes. 95 | Self = self(), 96 | Pids = [spawn(fun() -> test_node(Self) end) || _ <- lists:seq(1, N)], 97 | 98 | % Send each node its complete configuration record to get it started. 99 | C = #config{ tr = Tr, tb = Tb, ts = Ts, pids = Pids, steps = Steps, 100 | choose = Choose, random = Random, deliver = Deliver}, 101 | [lists:nth(I, Pids) ! C#config{nn=I} || I <- lists:seq(1, N)], 102 | 103 | % Wait until all nodes run the designated number of time steps. 104 | [test_wait(I) || I <- lists:seq(1, N)], 105 | Checker ! {stop}. % Terminate our checker process 106 | 107 | % Receive a node configuration, run a QSC node simulation with it, 108 | % then send a completion signal to our parent process. 109 | test_node(Parent) -> 110 | receive #config{} = C -> qsc(C), Parent ! {done, C#config.nn} end. 111 | 112 | % Wait to receive a signal that node I is finished. 113 | test_wait(I) -> receive {done, I} -> {} end. 114 | 115 | % test_checker() -> {} 116 | % Receive committed histories from all nodes and consistency-check them 117 | test_checker(Hp) -> 118 | receive {check, H} -> 119 | %io:fwrite("committed ~P~n", [H, 8]), 120 | test_checker(test_check(Hp, H)); 121 | {stop} -> {} 122 | end. 123 | 124 | % test_check(A, B) -> H 125 | % Check two histories A and B for consistency, and return the longer one. 126 | test_check(#hist{step=AC,pred=AP} = A, #hist{step=BC} = B) when AC > BC -> 127 | test_check(AP, B), A; % compare shorter prefix of A with B 128 | test_check(#hist{step=AC} = A, #hist{step=BC,pred=BP} = B) when BC > AC -> 129 | test_check(A, BP), B; % compare A with shorter prefix of B 130 | test_check(A, B) when A == B -> A; 131 | test_check(A, B) -> erlang:error({inconsistency, A, B}). 132 | 133 | % Run QSC and TLC through a test suite. 134 | test() -> 135 | [test_run(F, 1000) || F <- [1,2,3,4,5]], % simple test suite 136 | io:fwrite("Tests completed~n"). 137 | 138 | -------------------------------------------------------------------------------- /erlang/model/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | erl -make && erl -noshell -run qsc test -run init stop 3 | -------------------------------------------------------------------------------- /go/dist/README.md: -------------------------------------------------------------------------------- 1 | This Go package provides a simple but "real" distributed implementation of 2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC) 3 | for fail-stop, non-Byzantine environments. 
4 | For background information on QSC and TLC, 5 | and other model implementations in several languages, please see the 6 | [top level of this repository](https://github.com/dedis/tlc/). 7 | For more details on this package see the code and its 8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/dist). 9 | -------------------------------------------------------------------------------- /go/dist/causal.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | // Broadcast a copy of our current message template to all nodes. 4 | func (n *Node) broadcastCausal(msg *Message) { 5 | 6 | //println(n.self, n.tmpl.Step, "broadcastCausal", 7 | // "mat", len(n.mat)) 8 | 9 | // Assign the new message a sequence number 10 | msg.Seq = len(n.seqLog[n.self]) // Assign sequence number 11 | msg.Vec = n.mat[n.self].copy() // Include vector time update 12 | n.logCausal(n.self, msg) // Add msg to our log 13 | //println(n.self, n.tmpl.Step, "broadcastCausal step", msg.Step, 14 | // "typ", msg.Typ, "seq", msg.Seq, 15 | // "vec", fmt.Sprintf("%v", msg.Vec)) 16 | 17 | // We always receive our own message first. 18 | n.receiveTLC(msg) 19 | 20 | // Send it to all other peers. 21 | for dest := range n.peer { 22 | if dest != n.self { 23 | n.sendCausal(dest, msg) 24 | } 25 | } 26 | } 27 | 28 | // Log a peer's message, either our own (just sent) 29 | // or another node's (received and ready to be delivered). 30 | func (n *Node) logCausal(peer int, msg *Message) { 31 | 32 | // Update peer's matrix clock and our record of what it saw by msg 33 | for i := range n.peer { 34 | //println(i, "mat", len(n.mat), "vec", len(msg.Vec)) 35 | for n.mat[peer][i] < msg.Vec[i] { 36 | n.sawCausal(peer, n.seqLog[i][n.mat[peer][i]]) 37 | n.mat[peer][i]++ 38 | } 39 | } 40 | n.sawCausal(peer, msg) // msg has been seen by the peer that sent it 41 | n.sawCausal(n.self, msg) // and now we've seen the message too 42 | 43 | n.seqLog[peer] = append(n.seqLog[peer], msg) // log this msg 44 | n.mat[n.self][peer] = len(n.seqLog[peer]) // update our vector time 45 | if len(n.seqLog[peer]) != msg.Seq+1 { // sanity check 46 | panic("out of sync") 47 | } 48 | } 49 | 50 | // Record the fact that a given peer is now known to have seen a given message. 51 | // For Wit messages, record the fact that the proposal was threshold witnessed. 52 | func (n *Node) sawCausal(peer int, msg *Message) { 53 | n.saw[peer].add(msg) 54 | if msg.Typ == Wit { 55 | prop := n.seqLog[msg.From][msg.Prop] 56 | if prop.Typ != Prop { 57 | panic("not a proposal!") 58 | } 59 | n.wit[peer].add(prop) 60 | } 61 | } 62 | 63 | // Transmit a message to a particular node. 64 | func (n *Node) sendCausal(dest int, msg *Message) { 65 | //println(n.self, n.tmpl.Step, "sendCausal to", dest, "typ", msg.Typ, 66 | // "seq", msg.Seq) 67 | n.peer[dest].Send(msg) 68 | } 69 | 70 | // Receive a possibly out-of-order message from the network. 71 | // Enqueue it and actually deliver messages as soon as we can. 72 | func (n *Node) receiveCausal(msg *Message) { 73 | 74 | // Unicast acknowledgments don't get sequence numbers or reordering. 75 | if msg.Typ == Ack { 76 | n.receiveTLC(msg) // Just send it up the stack 77 | return 78 | } 79 | 80 | // Ignore duplicate message deliveries 81 | if msg.Seq < n.mat[n.self][msg.From] { 82 | println(n.self, n.tmpl.Step, "duplicate message from", msg.From, 83 | "seq", msg.Seq) 84 | panic("XXX") 85 | } 86 | 87 | // Enqueue broadcast message for delivery in causal order. 
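	// The message's slot in the out-of-order queue is its sequence number
	// minus the number of messages from this sender we have already
	// delivered (n.mat[n.self][msg.From]); grow the queue with nil
	// placeholders as needed so the message can be stored at that offset.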
88 | //println(n.self, n.tmpl.Step, "receiveCausal from", msg.From, 89 | // "type", msg.Typ, "seq", msg.Seq, 90 | // "vec", fmt.Sprintf("%v", msg.Vec)) 91 | //if len(n.oom[msg.From]) <= msg.Seq - n.mat[n.self][msg.From] - 1000 { 92 | // panic("huge jump") 93 | //} 94 | for len(n.oom[msg.From]) <= msg.Seq-n.mat[n.self][msg.From] { 95 | n.oom[msg.From] = append(n.oom[msg.From], nil) 96 | } 97 | n.oom[msg.From][msg.Seq-n.mat[n.self][msg.From]] = msg 98 | 99 | // Deliver whatever messages we can consistently with causal order. 100 | for progress := true; progress; { 101 | progress = false 102 | for i := range n.peer { 103 | progress = progress || n.deliverCausal(i) 104 | } 105 | } 106 | } 107 | 108 | // Try to deliver out-of-order messages held from a given peer. 109 | // Returns true if we made progress, false if nothing to do for this peer. 110 | func (n *Node) deliverCausal(peer int) bool { 111 | if len(n.oom[peer]) == 0 || n.oom[peer][0] == nil || 112 | !n.oom[peer][0].Vec.le(n.mat[n.self]) { 113 | return false 114 | } 115 | 116 | // Log the message now that it's in causal order. 117 | //println(n.self, n.tmpl.Step, "enqueueCausal", 118 | // "deliver type", msg.Typ, 119 | // "seq", msg.Seq, "#oom", len(n.oom[i])) 120 | msg := n.oom[peer][0] 121 | n.logCausal(peer, msg) 122 | 123 | // Remove it from this peer's out-of-order message queue. 124 | n.oom[peer] = n.oom[peer][1:] 125 | 126 | // Deliver the message to upper layers. 127 | n.receiveTLC(msg) 128 | 129 | return true // made progress 130 | } 131 | 132 | // Initialize the causality and higher layer state for a node. 133 | func (n *Node) initCausal() { 134 | n.mat = make([]vec, len(n.peer)) 135 | n.oom = make([][]*Message, len(n.peer)) 136 | n.seqLog = make([][]*Message, len(n.peer)) 137 | n.saw = make([]set, len(n.peer)) 138 | n.wit = make([]set, len(n.peer)) 139 | for i := range n.peer { 140 | n.mat[i] = make(vec, len(n.peer)) 141 | n.saw[i] = make(set) 142 | n.wit[i] = make(set) 143 | } 144 | 145 | n.initTLC() 146 | } 147 | -------------------------------------------------------------------------------- /go/dist/dist_test.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/ecdsa" 7 | "crypto/elliptic" 8 | crand "crypto/rand" 9 | "crypto/tls" 10 | "crypto/x509" 11 | "encoding/gob" 12 | "encoding/json" 13 | "encoding/pem" 14 | "fmt" 15 | "io" 16 | "math/big" 17 | mrand "math/rand" 18 | "net" 19 | "os" 20 | "os/exec" 21 | "sync" 22 | "testing" 23 | "time" 24 | ) 25 | 26 | // MaxSteps to take 27 | var MaxSteps int 28 | 29 | // Maximum random delays to add to message deliveries for testing 30 | var MaxSleep time.Duration 31 | 32 | // Whether to run consensus among multiple separate processes 33 | var MultiProcess = true 34 | 35 | // Whether to use TLS encryption and authentication atop TCP 36 | var UseTLS = true 37 | 38 | // Information about each virtual host passed to child processes via JSON 39 | type testHost struct { 40 | Name string // Virtual host name 41 | Addr string // Host IP address and TCP port 42 | Cert []byte // Host's self-signed x509 certificate 43 | } 44 | 45 | // Configuration information each child goroutine or process needs to launch 46 | type testConfig struct { 47 | Self int // Which participant number we are 48 | Nnodes int // Total number of participants 49 | HostName string // This child's virtual hostname 50 | 51 | MaxSteps int 52 | MaxTicket int32 53 | MaxSleep time.Duration 54 | } 55 | 56 | func TestQSC(t 
*testing.T) { 57 | 58 | testCase(t, 1, 1, 10000, 0, 0) // Trivial case: 1 of 1 consensus! 59 | testCase(t, 2, 2, 10000, 0, 0) // Another trivial case: 2 of 2 60 | 61 | testCase(t, 2, 3, 1000, 0, 0) // Standard f=1 case 62 | testCase(t, 3, 5, 1000, 0, 0) // Standard f=2 case 63 | testCase(t, 4, 7, 100, 0, 0) // Standard f=3 case 64 | testCase(t, 5, 9, 100, 0, 0) // Standard f=4 case 65 | testCase(t, 11, 21, 20, 0, 0) // Standard f=10 case 66 | //testCase(t, 101, 201, 10, 0, 0) // Standard f=100 case - blows up 67 | 68 | testCase(t, 3, 3, 100, 0, 0) // Larger-than-minimum thresholds 69 | testCase(t, 6, 7, 100, 0, 0) 70 | testCase(t, 9, 10, 100, 0, 0) 71 | 72 | // Test with low-entropy tickets: 73 | // commit success rate will be bad, but still must remain safe! 74 | testCase(t, 2, 3, 10, 1, 0) // Limit case: will never commit 75 | testCase(t, 2, 3, 100, 2, 0) // Extreme low-entropy: rarely commits 76 | testCase(t, 2, 3, 100, 3, 0) // A bit better bit still bad... 77 | 78 | // Test with random delays inserted 79 | testCase(t, 2, 3, 100, 0, 1*time.Nanosecond) 80 | testCase(t, 2, 3, 100, 0, 1*time.Microsecond) 81 | testCase(t, 2, 3, 100, 0, 1*time.Millisecond) 82 | testCase(t, 4, 7, 100, 0, 1*time.Microsecond) 83 | testCase(t, 4, 7, 100, 0, 1*time.Millisecond) 84 | } 85 | 86 | func testCase(t *testing.T, threshold, nnodes, maxSteps, maxTicket int, 87 | maxSleep time.Duration) { 88 | 89 | if maxTicket == 0 { // Default to moderate-entropy tickets 90 | maxTicket = 10 * nnodes 91 | } 92 | 93 | desc := fmt.Sprintf("T=%v,N=%v,Steps=%v,Tickets=%v,Sleep=%v", 94 | threshold, nnodes, maxSteps, maxTicket, maxSleep) 95 | t.Run(desc, func(t *testing.T) { 96 | 97 | // Configure and run the test case. 98 | MaxSteps = maxSteps 99 | MaxTicket = int32(maxTicket) 100 | MaxSleep = maxSleep 101 | Threshold = threshold 102 | 103 | testExec(t, threshold, nnodes) 104 | }) 105 | } 106 | 107 | func testExec(t *testing.T, threshold, nnodes int) { 108 | 109 | // Create a cancelable context in which to execute helper processes 110 | ctx, cancel := context.WithCancel(context.Background()) 111 | defer cancel() // kill child processes 112 | 113 | // Create a public/private keypair and self-signed cert for each node. 114 | conf := make([]testConfig, nnodes) // each node's config information 115 | for i := range conf { 116 | conf[i].Self = i 117 | conf[i].Nnodes = nnodes 118 | conf[i].HostName = fmt.Sprintf("host%v", i) 119 | conf[i].MaxSteps = MaxSteps 120 | conf[i].MaxTicket = MaxTicket 121 | conf[i].MaxSleep = MaxSleep 122 | } 123 | 124 | // Start the per-node child processes, 125 | // and gather network addresses and certificates from each one. 
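	// The parent and each child coordinate over the child's stdin/stdout
	// using JSON: the parent sends a testConfig, the child replies with the
	// testHost (address and certificate) it is listening on, the parent
	// broadcasts the full host list, then later collects each child's
	// consensus history and finally sends an empty object to let it exit.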
126 | childGroup := &sync.WaitGroup{} 127 | host := make([]testHost, nnodes) 128 | enc := make([]*json.Encoder, nnodes) 129 | dec := make([]*json.Decoder, nnodes) 130 | for i := range host { 131 | 132 | childGroup.Add(1) 133 | childIn, childOut := testExecChild(ctx, &conf[i], t, childGroup) 134 | 135 | // We'll communicate with the child via JSON-encoded stdin/out 136 | enc[i] = json.NewEncoder(childIn) 137 | dec[i] = json.NewDecoder(childOut) 138 | 139 | // Send the child its configuration information 140 | if err := enc[i].Encode(&conf[i]); err != nil { 141 | t.Fatalf("Encode: " + err.Error()) 142 | } 143 | 144 | // Get the network address the child is listening on 145 | if err := dec[i].Decode(&host[i]); err != nil { 146 | t.Fatalf("Decode: %v", err.Error()) 147 | } 148 | if host[i].Name != conf[i].HostName { // sanity check 149 | panic("hostname mismatch") 150 | } 151 | //println("child", i, "listening on", host[i].Addr) 152 | } 153 | 154 | // Send the array of addresses to all the child processes 155 | for i := range host { 156 | if err := enc[i].Encode(host); err != nil { 157 | t.Fatalf("Encode: " + err.Error()) 158 | } 159 | } 160 | 161 | // Wait and collect the consensus histories of each child 162 | hist := make([][]choice, nnodes) 163 | for i := range host { 164 | if err := dec[i].Decode(&hist[i]); err != nil { 165 | t.Fatalf("Decode: %v", err.Error()) 166 | } 167 | } 168 | 169 | // Let all the children know they can exit 170 | for i := range host { 171 | if err := enc[i].Encode(struct{}{}); err != nil { 172 | t.Fatalf("Encode: " + err.Error()) 173 | } 174 | } 175 | 176 | // Wait for the helper processes to complete 177 | childGroup.Wait() 178 | } 179 | 180 | // Exec a child as a separate process. 181 | func testExecChild(ctx context.Context, conf *testConfig, t *testing.T, 182 | grp *sync.WaitGroup) (io.Writer, io.Reader) { 183 | 184 | if !MultiProcess { 185 | // Run a child as a separate goroutine in the same process. 
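		// A pair of in-memory pipes stands in for the child's
		// stdin/stdout: the parent writes into childInWr and reads
		// replies from childOutRd.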
186 | childInRd, childInWr := io.Pipe() 187 | childOutRd, childOutWr := io.Pipe() 188 | go func() { 189 | testChild(childInRd, childOutWr) 190 | grp.Done() 191 | }() 192 | return childInWr, childOutRd 193 | } 194 | 195 | // Run the child as a separate helper process 196 | cmd := exec.CommandContext(ctx, os.Args[0], 197 | "-test.run=TestHelper") 198 | cmd.Env = append(os.Environ(), "TLC_HELPER=1") 199 | 200 | // Arrange to send standard input to the child via pipe 201 | childIn, err := cmd.StdinPipe() 202 | if err != nil { 203 | t.Fatalf("StdinPipe: %v", err.Error()) 204 | } 205 | 206 | // Copy child's standard output to parent via pipe 207 | childOut, err := cmd.StdoutPipe() 208 | if err != nil { 209 | t.Fatalf("StdoutPipe: %v", err.Error()) 210 | } 211 | 212 | // Copy child's standard error to parent's standard error 213 | childErr, err := cmd.StderrPipe() 214 | if err != nil { 215 | t.Fatalf("StderrPipe: %v", err.Error()) 216 | } 217 | go copyAll(os.Stderr, childErr) 218 | 219 | // Start the command running 220 | if err := cmd.Start(); err != nil { 221 | t.Fatalf("cmd.Start: %v", err.Error()) 222 | } 223 | 224 | // Arrange to signal the provided WaitGroup when child terminates 225 | go func() { 226 | if err := cmd.Wait(); err != nil { 227 | t.Fatalf("cmd.Wait: %v", err.Error()) 228 | } 229 | grp.Done() 230 | }() 231 | 232 | return childIn, childOut 233 | } 234 | 235 | func TestHelper(t *testing.T) { 236 | 237 | if os.Getenv("TLC_HELPER") == "" { 238 | return // Do nothing except when called as a helper 239 | } 240 | 241 | // Exit with error status if anything goes wrong. 242 | defer os.Exit(1) 243 | 244 | testChild(os.Stdin, os.Stdout) 245 | os.Exit(0) 246 | } 247 | 248 | func copyAll(dst io.Writer, src io.Reader) { 249 | if _, err := io.Copy(dst, src); err != nil { 250 | println("Copy: " + err.Error()) 251 | } 252 | } 253 | 254 | func createCert(hostName string) (certPemBytes, privPemBytes []byte) { 255 | 256 | priv, err := ecdsa.GenerateKey(elliptic.P256(), crand.Reader) 257 | if err != nil { 258 | panic("createCert: " + err.Error()) 259 | } 260 | 261 | notBefore := time.Now() // valid starting now 262 | notAfter := notBefore.Add(365 * 24 * time.Hour) // valid for a year 263 | tmpl := x509.Certificate{ 264 | NotBefore: notBefore, 265 | NotAfter: notAfter, 266 | IsCA: true, 267 | KeyUsage: x509.KeyUsageKeyEncipherment | 268 | x509.KeyUsageDigitalSignature | 269 | x509.KeyUsageCertSign, 270 | ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, 271 | x509.ExtKeyUsageClientAuth}, 272 | BasicConstraintsValid: true, 273 | DNSNames: []string{hostName}, 274 | SerialNumber: big.NewInt(1), 275 | } 276 | certb, err := x509.CreateCertificate(crand.Reader, &tmpl, &tmpl, 277 | &priv.PublicKey, priv) 278 | if err != nil { 279 | panic("createCert: " + err.Error()) 280 | } 281 | 282 | cert, err := x509.ParseCertificate(certb) 283 | if err != nil { 284 | panic("ParseCertificate: " + err.Error()) 285 | } 286 | 287 | if err := cert.VerifyHostname(hostName); err != nil { 288 | panic("VerifyHostname: " + err.Error()) 289 | } 290 | 291 | // Sanity-check the certificate just to make sure it actually works. 
292 | pool := x509.NewCertPool() 293 | pool.AddCert(cert) 294 | vo := x509.VerifyOptions{DNSName: hostName, Roots: pool} 295 | if _, err := cert.Verify(vo); err != nil { 296 | panic("Verify: " + err.Error()) 297 | } 298 | //println("verified for", hostName) 299 | 300 | // PEM-encode our certificate 301 | certPem := bytes.NewBuffer(nil) 302 | if err := pem.Encode(certPem, &pem.Block{Type: "CERTIFICATE", 303 | Bytes: certb}); err != nil { 304 | panic("pem.Encode: " + err.Error()) 305 | } 306 | 307 | // PEM-encode our private key 308 | privb, err := x509.MarshalECPrivateKey(priv) 309 | if err != nil { 310 | panic("x509.MarshalECPrivateKey: " + err.Error()) 311 | } 312 | privPem := bytes.NewBuffer(nil) 313 | if err := pem.Encode(privPem, &pem.Block{Type: "EC PRIVATE KEY", 314 | Bytes: privb}); err != nil { 315 | panic("pem.Encode: " + err.Error()) 316 | } 317 | 318 | return certPem.Bytes(), privPem.Bytes() 319 | } 320 | 321 | func testChild(in io.Reader, out io.Writer) { 322 | 323 | // We'll use JSON over stdin/stdout to coordinate with our parent. 324 | dec := json.NewDecoder(in) 325 | enc := json.NewEncoder(out) 326 | 327 | // Get the child process config information via JSON 328 | conf := testConfig{} 329 | if err := dec.Decode(&conf); err != nil { 330 | panic("Decode: " + err.Error()) 331 | } 332 | self := conf.Self 333 | MaxSteps = conf.MaxSteps 334 | MaxTicket = conf.MaxTicket 335 | MaxSleep = conf.MaxSleep 336 | 337 | // Initialize the node appropriately 338 | //println("self", self, "nnodes", conf.Nnodes) 339 | n := &Node{} 340 | n.init(self, make([]peer, conf.Nnodes)) 341 | n.mutex.Lock() // keep node's TLC state locked until fully set up 342 | 343 | // Create a TLS/TCP listen socket for this child 344 | tcpl, err := net.Listen("tcp", "") 345 | if err != nil { 346 | panic("Listen: " + err.Error()) 347 | } 348 | 349 | // Create an x509 certificate and private key for this child 350 | //println(self, "createCert for", conf.HostName) 351 | certb, privb := createCert(conf.HostName) 352 | 353 | // Create a TLS certificate from it 354 | tlscert, err := tls.X509KeyPair(certb, privb) 355 | if err != nil { 356 | panic("tls.X509KeyPair: " + err.Error()) 357 | } 358 | 359 | // Report our network address and certificate to the parent process 360 | myHost := testHost{ 361 | Name: conf.HostName, 362 | Addr: tcpl.Addr().String(), 363 | Cert: certb, 364 | } 365 | if err := enc.Encode(myHost); err != nil { 366 | panic("Encode: " + err.Error()) 367 | } 368 | 369 | // Get the list of all host names, addresses, and certs from the parent 370 | host := []testHost{} 371 | if err := dec.Decode(&host); err != nil { 372 | panic("Decode: " + err.Error()) 373 | } 374 | 375 | // Create a certificate pool containing all nodes' certificates 376 | pool := x509.NewCertPool() 377 | for i := range host { 378 | if !pool.AppendCertsFromPEM(host[i].Cert) { 379 | panic("failed to append cert from " + host[i].Name) 380 | } 381 | } 382 | 383 | //println("hostName", conf.HostName, "pool", len(pool.Subjects())) 384 | 385 | // Listen and accept TCP/TLS connections 386 | donegrp := &sync.WaitGroup{} 387 | go func() { 388 | for { 389 | // Accept a TCP connection 390 | tcpc, err := tcpl.Accept() 391 | if err != nil { 392 | panic("Accept: " + err.Error()) 393 | } 394 | 395 | // Launch a goroutine to process it 396 | donegrp.Add(1) 397 | go n.acceptNetwork(tcpc, &tls.Config{ 398 | RootCAs: pool, 399 | Certificates: []tls.Certificate{tlscert}, 400 | ServerName: conf.HostName, 401 | ClientAuth: tls.RequireAndVerifyClientCert, 402 | 
ClientCAs: pool, 403 | }, host, donegrp) 404 | } 405 | }() 406 | 407 | // Open TCP and optionally TLS connections to each peer 408 | //println(self, "open TLS connections to", len(host), "peers") 409 | stepgrp := &sync.WaitGroup{} 410 | for i := range host { 411 | // Open an authenticated TLS connection to peer i 412 | peerConf := tls.Config{ 413 | RootCAs: pool, 414 | Certificates: []tls.Certificate{tlscert}, 415 | ServerName: conf.HostName, 416 | ClientAuth: tls.RequireAndVerifyClientCert, 417 | ClientCAs: pool, 418 | } 419 | peerConf.ServerName = host[i].Name 420 | //println(self, "Dial", host[i].Name, host[i].Addr) 421 | var conn net.Conn 422 | if UseTLS { 423 | conn, err = tls.Dial("tcp", host[i].Addr, &peerConf) 424 | } else { 425 | conn, err = net.Dial("tcp", host[i].Addr) 426 | } 427 | if err != nil { 428 | panic("Dial: " + err.Error()) 429 | } 430 | 431 | // Tell the server which client we are. 432 | enc := gob.NewEncoder(conn) 433 | if err := enc.Encode(self); err != nil { 434 | panic("gob.Encode: " + err.Error()) 435 | } 436 | 437 | // Set up a peer sender object. 438 | // It signals stepgrp.Done() after enough steps pass. 439 | stepgrp.Add(1) 440 | n.peer[i] = &testPeer{enc, stepgrp, conn} 441 | } 442 | //println(self, "opened TLS connections") 443 | 444 | // Start the consensus test 445 | n.advanceTLC(0) 446 | 447 | // Now we can let the receive goroutines process incoming messages 448 | n.mutex.Unlock() 449 | 450 | // Wait to finish enough consensus rounds 451 | //println(self, "wait for test to complete") 452 | stepgrp.Wait() 453 | 454 | // Report our observed consensus history to the parent 455 | if err := enc.Encode(n.choice); err != nil { 456 | panic("Encode: " + err.Error()) 457 | } 458 | 459 | // Finally, wait for our parent to signal when the test is complete. 460 | if err := dec.Decode(&struct{}{}); err != nil { 461 | panic("Decode: " + err.Error()) 462 | } 463 | 464 | //println(self, "child finished") 465 | } 466 | 467 | // Accept a new TLS connection on a TCP server socket. 468 | func (n *Node) acceptNetwork(conn net.Conn, tlsConf *tls.Config, 469 | host []testHost, donegrp *sync.WaitGroup) { 470 | 471 | // Enable TLS on the connection and run the handshake. 472 | if UseTLS { 473 | conn = tls.Server(conn, tlsConf) 474 | } 475 | defer func() { conn.Close() }() 476 | 477 | // Receive the client's nodenumber indication 478 | dec := gob.NewDecoder(conn) 479 | var peer int 480 | if err := dec.Decode(&peer); err != nil { 481 | println(n.self, "acceptNetwork gob.Decode: "+err.Error()) 482 | return 483 | //panic("acceptNetwork gob.Decode: " + err.Error()) 484 | } 485 | if peer < 0 || peer >= len(host) { 486 | println("acceptNetwork: bad peer number") 487 | return 488 | } 489 | 490 | // Authenticate the client with TLS. 491 | // XXX Why doesn't VerifyHostname work to verify a client auth? 492 | // Go TLS bug to report? 
493 | //if err := tlsc.VerifyHostname(host[peer].Name); err != nil { 494 | // panic("VerifyHostname: " + err.Error()) 495 | //} 496 | if UseTLS { 497 | cs := conn.(*tls.Conn).ConnectionState() 498 | if len(cs.PeerCertificates) < 1 { 499 | println("acceptNetwork: no certificate from client") 500 | return 501 | } 502 | err := cs.PeerCertificates[0].VerifyHostname(host[peer].Name) 503 | if err != nil { 504 | println("VerifyHostname: " + err.Error()) 505 | return 506 | } 507 | } 508 | 509 | // Receive and process arriving messages 510 | n.runReceiveNetwork(peer, dec, donegrp) 511 | } 512 | 513 | // Receive messages from a connection and dispatch them into the TLC stack. 514 | func (n *Node) runReceiveNetwork(peer int, dec *gob.Decoder, 515 | grp *sync.WaitGroup) { 516 | for { 517 | // Get next message from this peer 518 | msg := Message{} 519 | err := dec.Decode(&msg) 520 | if err == io.EOF { 521 | break 522 | } else if err != nil { 523 | panic("receiveCausal:" + err.Error()) 524 | } 525 | //println(n.self, n.tmpl.Step, "runReceiveNetwork: recv from", 526 | // msg.From, "type", msg.Typ, "seq", msg.Seq, 527 | // "step", msg.Step) 528 | 529 | // Optionally insert random delays on a message basis 530 | time.Sleep(time.Duration(mrand.Int63n(int64(MaxSleep + 1)))) 531 | 532 | grp.Add(1) 533 | go n.receiveNetwork(&msg, grp) 534 | } 535 | grp.Done() // signal that we're done 536 | } 537 | 538 | func (n *Node) receiveNetwork(msg *Message, grp *sync.WaitGroup) { 539 | 540 | // Keep the stack single-threaded. 541 | n.mutex.Lock() 542 | defer func() { 543 | n.mutex.Unlock() 544 | grp.Done() 545 | }() 546 | 547 | // Dispatch up to the causal ordering layer 548 | //println(n.self, n.tmpl.Step, "receiveNetwork from", msg.From, 549 | // "type", msg.Typ, "seq", msg.Seq, "vec", len(msg.Vec)) 550 | n.receiveCausal(msg) 551 | } 552 | 553 | type testPeer struct { 554 | e *gob.Encoder 555 | w *sync.WaitGroup 556 | c io.Closer 557 | } 558 | 559 | func (tp *testPeer) Send(msg *Message) { 560 | if tp.e != nil { 561 | //println("testPeer.Send seq", msg.Seq, "step", msg.Step, 562 | // "MaxSteps", MaxSteps) 563 | if err := tp.e.Encode(msg); err != nil { 564 | println("Encode:", err.Error()) 565 | } 566 | } 567 | if tp.w != nil && MaxSteps > 1 && msg.Step >= MaxSteps { 568 | //println("testPeer.Send done") 569 | tp.w.Done() 570 | tp.w = nil 571 | } 572 | } 573 | -------------------------------------------------------------------------------- /go/dist/doc.go: -------------------------------------------------------------------------------- 1 | // Package dist implements a minimalistic distributed implementation 2 | // of TLC and QSC for the non-Byzantine (fail-stop) threat model. 3 | // It uses TLS/TCP for communication, gob encoding for serialization, and 4 | // vector time and a basic causal ordering protocol using vector time. 
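// The code is organized as a small protocol stack: a network/peering layer
// (TLS/TCP with gob encoding), a causal ordering layer driven by vector and
// matrix clocks, the threshold logical clock (TLC) layer, and the QSC
// consensus layer on top; node.go shows the per-layer state a Node keeps.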
5 | package dist 6 | -------------------------------------------------------------------------------- /go/dist/node.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | // Threshold is the TLC and consensus threshold 8 | var Threshold int 9 | 10 | // MaxTicket is the Amount of entropy in lottery tickets 11 | var MaxTicket int32 = 100 12 | 13 | // Type of message 14 | type Type int 15 | 16 | const ( 17 | // Prop is a raw unwitnessed proposal 18 | Prop Type = iota 19 | // Ack is an acknowledgment of a proposal 20 | Ack 21 | // Wit is a threshold witness confirmation of proposal 22 | Wit 23 | ) 24 | 25 | // Message over the network 26 | type Message struct { 27 | // Network/peering layer 28 | 29 | // From designates the node which originally sent this message 30 | From int 31 | 32 | // Causality layer 33 | // Seq is the Node-local sequence number for vector time 34 | Seq int 35 | // Vev is the Vector clock update from sender node 36 | Vec vec 37 | 38 | // Threshold time (TLC) layer 39 | // Step is the logical time step this message is for 40 | Step int 41 | // typ is the message type 42 | Typ Type 43 | // Prop is the proposal Seq this Ack or Wit is about 44 | Prop int 45 | // Ticket is the genetic fitness ticket for this proposal 46 | Ticket int32 47 | } 48 | 49 | // Node definition 50 | type Node struct { 51 | // Network/peering layer 52 | self int // This node's participant number 53 | peer []peer // How to send messages to each peer 54 | mutex sync.Mutex // Mutex protecting node's protocol stack 55 | 56 | // Causal history layer 57 | mat []vec // Node's current matrix clock 58 | oom [][]*Message // Out-of-order messages not yet delivered 59 | seqLog [][]*Message // Nodes' message received and delivered by seq 60 | saw []set // Messages each node saw recently 61 | wit []set // Witnessed messages each node saw recently 62 | 63 | // Threshold time (TLC) layer 64 | tmpl Message // Template for messages we send 65 | save int // Earliest step for which we maintain history 66 | acks set // Acknowledgments we've received in this step 67 | wits set // Threshold witnessed messages seen this step 68 | stepLog [][]logEntry // Nodes' messages seen by start of recent steps 69 | 70 | // This node's record of QSC consensus history 71 | choice []choice // Best proposal this node chose each round 72 | } 73 | 74 | type peer interface { 75 | Send(msg *Message) 76 | } 77 | 78 | // Info each node logs about other nodes' views at the start of each time-step 79 | type logEntry struct { 80 | saw set // All nodes' messages the node had seen by then 81 | wit set // Threshold witnessed messages it had seen 82 | } 83 | 84 | // Record of one node's QSC decision in one time-step 85 | type choice struct { 86 | best int // Best proposal this node chose in this round 87 | commit bool // Whether node observed successful commitment 88 | } 89 | 90 | func (n *Node) init(self int, peer []peer) { 91 | n.self = self 92 | n.peer = peer 93 | 94 | n.initCausal() 95 | n.initTLC() 96 | } 97 | -------------------------------------------------------------------------------- /go/dist/qsc.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | // RoundSteps is three because the witnessed QSC requires three TLC 4 | // time-steps per consensus round. 
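// A round that begins at TLC step s therefore completes at step s+3:
// each time the node advances to a new step, advanceQSC below evaluates the
// round that began RoundSteps earlier (if any), considering the proposals
// broadcast at s+0 that are in its view by the end of the round at s+3.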
5 | const RoundSteps = 3 6 | 7 | // The TLC layer upcalls this method on advancing to a new time-step, 8 | // with sets of proposals seen (saw) and threshold witnessed (wit) recently. 9 | func (n *Node) advanceQSC(saw, wit set) { 10 | //println(n.self, n.tmpl.Step, "advanceQSC saw", len(saw), 11 | // "wit", len(wit)) 12 | 13 | // Calculate the starting step of the round that's just now completing. 14 | s := n.tmpl.Step - RoundSteps 15 | if s < 0 { 16 | return // Nothing to be done until the first round completes 17 | } 18 | 19 | // Find the best eligible proposal that was broadcast at s+0 20 | // and that is in our view by the end of the round at s+3. 21 | var bestProp *Message 22 | var bestTicket int32 23 | for p := range wit { 24 | if p.Typ != Prop { 25 | panic("wit should contain only proposals") 26 | } 27 | if p.Step == s+0 && p.Ticket >= bestTicket { 28 | bestProp = p 29 | bestTicket = p.Ticket 30 | } 31 | } 32 | 33 | // Determine if we can consider this proposal permanently committed. 34 | spoiled := n.spoiledQSC(s, saw, bestProp, bestTicket) 35 | reconfirmed := n.reconfirmedQSC(s, wit, bestProp) 36 | committed := !spoiled && reconfirmed 37 | 38 | // Record the consensus results for this round (from s to s+3). 39 | n.choice = append(n.choice, choice{bestProp.From, committed}) 40 | //println(n.self, n.tmpl.Step, "choice", bestProp.From, "spoiled", spoiled, 41 | // "reconfirmed", reconfirmed, "committed", committed) 42 | 43 | // Don't bother saving history before the start of the next round. 44 | n.save = s + 1 45 | } 46 | 47 | // Return true if there's another proposal competitive with a given candidate. 48 | func (n *Node) spoiledQSC(s int, saw set, prop *Message, ticket int32) bool { 49 | for p := range saw { 50 | if p.Step == s+0 && p.Typ == Prop && p != prop && 51 | p.Ticket >= ticket { 52 | return true // victory spoiled by competition! 53 | } 54 | } 55 | return false 56 | } 57 | 58 | // Return true if given proposal was doubly confirmed (reconfirmed). 59 | func (n *Node) reconfirmedQSC(s int, wit set, prop *Message) bool { 60 | for p := range wit { // search for a paparazzi witness at s+1 61 | if p.Step == s+1 && n.stepLog[p.From][s+1].wit.has(prop) { 62 | return true 63 | } 64 | } 65 | return false 66 | } 67 | -------------------------------------------------------------------------------- /go/dist/set.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | // Use a map to represent a set of messages 4 | type set map[*Message]struct{} 5 | 6 | // Test if msg is in set s. 7 | func (s set) has(msg *Message) bool { 8 | _, present := s[msg] 9 | return present 10 | } 11 | 12 | // Add msg to set s. 13 | func (s set) add(msg *Message) { 14 | s[msg] = struct{}{} 15 | } 16 | 17 | // Return a copy of message set s, 18 | // dropping any messages before earliest. 
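// The TLC layer uses this to prune per-node history: once a consensus round
// completes, messages from steps before n.save no longer need to be kept
// (see the copies taken in receiveTLC and the update of n.save in advanceQSC).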
19 | func (s set) copy(earliest int) set { 20 | n := make(set) 21 | for k, v := range s { 22 | if k.Step >= earliest { 23 | n[k] = v 24 | } 25 | } 26 | return n 27 | } 28 | -------------------------------------------------------------------------------- /go/dist/tlc.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | import ( 4 | "math/rand" 5 | ) 6 | 7 | // Initialize the TLC layer state in a Node 8 | func (n *Node) initTLC() { 9 | n.tmpl = Message{From: n.self, Step: -1} 10 | n.stepLog = make([][]logEntry, len(n.peer)) 11 | } 12 | 13 | // Broadcast a copy of our current message template to all nodes 14 | func (n *Node) broadcastTLC() *Message { 15 | 16 | //println(n.self, n.tmpl.Step, "broadcast", msg, "typ", msg.Typ) 17 | msg := n.tmpl 18 | n.broadcastCausal(&msg) 19 | return &msg 20 | } 21 | 22 | // Unicast an acknowledgment of a given proposal to its sender 23 | func (n *Node) acknowledgeTLC(prop *Message) { 24 | 25 | msg := n.tmpl 26 | msg.Typ = Ack 27 | msg.Prop = prop.Seq 28 | n.sendCausal(prop.From, &msg) 29 | } 30 | 31 | // Advance to a new time step. 32 | func (n *Node) advanceTLC(step int) { 33 | //println(n.self, step, "advanceTLC", 34 | // "saw", len(n.saw[n.self]), "wit", len(n.wit[n.self])) 35 | 36 | // Initialize our message template for new time step 37 | n.tmpl.Step = step // Advance to new time step 38 | n.tmpl.Typ = Prop // Raw unwitnessed proposal message initially 39 | n.tmpl.Ticket = rand.Int31n(MaxTicket) // Choose a ticket 40 | 41 | n.acks = make(set) // No acknowledgments received yet in this step 42 | n.wits = make(set) // No threshold witnessed messages received yet 43 | 44 | // Notify the upper (QSC) layer of the advancement of time, 45 | // and let it fill in its part of the new message to broadcast. 46 | n.advanceQSC(n.saw[n.self], n.wit[n.self]) 47 | 48 | prop := n.broadcastTLC() // broadcast our raw proposal 49 | n.tmpl.Prop = prop.Seq // save proposal's sequence number 50 | n.acks.add(prop) // automatically self-acknowledge it 51 | } 52 | 53 | func (n *Node) receiveTLC(msg *Message) { 54 | 55 | // Now process this message according to type. 56 | //println(n.self, n.tmpl.Step, "receivedTLC from", msg.From, 57 | // "step", msg.Step, "typ", msg.Typ) 58 | switch msg.Typ { 59 | case Prop: // A raw unwitnessed proposal broadcast. 60 | 61 | // Record the set of messages this node had seen 62 | // by the time it advanced to this new time-step. 63 | if len(n.stepLog[msg.From]) != msg.Step { 64 | panic("out of sync") 65 | } 66 | n.stepLog[msg.From] = append(n.stepLog[msg.From], 67 | logEntry{n.saw[msg.From], n.wit[msg.From]}) 68 | 69 | // Continue from pruned copies in the next time step 70 | n.saw[msg.From] = n.saw[msg.From].copy(n.save) 71 | n.wit[msg.From] = n.wit[msg.From].copy(n.save) 72 | 73 | if msg.Step == n.tmpl.Step { 74 | //println(n.self, n.tmpl.Step, "ack", msg.From) 75 | n.acknowledgeTLC(msg) 76 | } 77 | 78 | case Ack: // An acknowledgment. Collect a threshold of acknowledgments. 79 | if msg.Prop == n.tmpl.Prop { // only if it acks our proposal 80 | n.acks.add(msg) 81 | //println(n.self, n.tmpl.Step, "got ack", len(n.acks)) 82 | if n.tmpl.Typ == Prop && len(n.acks) >= Threshold { 83 | 84 | // Broadcast a threshold-witnesed certification 85 | n.tmpl.Typ = Wit 86 | n.broadcastTLC() 87 | } 88 | } 89 | 90 | case Wit: // A threshold-witnessed message. Collect a threshold of them. 
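		// msg.Prop holds the sequence number of the proposal being
		// witnessed within the sender's own message log; look the
		// proposal up there and sanity-check its type.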
91 | prop := n.seqLog[msg.From][msg.Prop] 92 | if prop.Typ != Prop { 93 | panic("doesn't refer to a proposal!") 94 | } 95 | if msg.Step == n.tmpl.Step { 96 | 97 | // Collect a threshold of Wit witnessed messages. 98 | n.wits.add(prop) // witnessed messages in this step 99 | if len(n.wits) >= Threshold { 100 | 101 | // We've met the condition to advance time. 102 | n.advanceTLC(n.tmpl.Step + 1) 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /go/dist/vec.go: -------------------------------------------------------------------------------- 1 | package dist 2 | 3 | // Vector timestemp 4 | type vec []int 5 | 6 | // Return a copy of this vector 7 | func (v vec) copy() vec { 8 | return append(vec{}, v...) 9 | } 10 | 11 | // Return true if vector timestamp v is causally before or equal to y. 12 | func (v vec) le(y vec) bool { 13 | for i := range v { 14 | if v[i] > y[i] { 15 | return false 16 | } 17 | } 18 | return true 19 | } 20 | 21 | // Set v to the elementwise maximum of vectors x and y. 22 | // Inputs x and/or y can be the same as target v. 23 | func (v vec) max(x, y vec) { 24 | for i := range v { 25 | if x[i] > y[i] { 26 | v[i] = x[i] 27 | } else { 28 | v[i] = y[i] 29 | } 30 | } 31 | } 32 | 33 | //func (v vec) String() { 34 | // fmt.Sprintf("%v", []int(v)) 35 | //} 36 | -------------------------------------------------------------------------------- /go/lib/backoff/retry.go: -------------------------------------------------------------------------------- 1 | // This package converts errors into time delays via random exponential backoff. 2 | // It is designed to be extremely simple to use but robust and automatic. 3 | // 4 | package backoff 5 | 6 | import ( 7 | "context" 8 | "log" 9 | "math/rand" 10 | "time" 11 | ) 12 | 13 | // Retry calls try() repeatedly until it returns without an error, 14 | // with the default exponential backoff configuration. 15 | // 16 | // By default, Retry continues to try forever until it succeeds. 17 | // The caller may pass a cancelable context in the ctx parameter, however, 18 | // in case Retry will give up calling try when the context is cancelled. 19 | // If the context was already cancelled on the call to Retry, 20 | // then Retry returns ctx.Err() immediately without calling try. 21 | // 22 | func Retry(ctx context.Context, try func() error) error { 23 | return Config{}.Retry(ctx, try) 24 | } 25 | 26 | // Config represents configuration parameters for exponential backoff. 27 | // To use, initialize a Config structure with the desired parameters 28 | // and then call Config.Retry(). 29 | // 30 | // Report, if non-nil, is a function called by Retry to report errors 31 | // in an appropriate fashion specific to the application. 32 | // If nil, Retry reports errors via log.Println by default. 33 | // Report may also return a non-nil error to abort the Retry loop if it 34 | // determines that the detected error is permanent and waiting will not help. 35 | // 36 | type Config struct { 37 | Report func(error) error // Function to report errors 38 | MaxWait time.Duration // Maximum backoff wait period 39 | 40 | mayGrow struct{} // Ensure Config remains extensible 41 | } 42 | 43 | func defaultReport(err error) error { 44 | log.Println(err.Error()) 45 | return nil 46 | } 47 | 48 | // Retry calls try() repeatedly until it returns without an error, 49 | // using exponential backoff configuration c. 
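//
// For illustration, a minimal (hypothetical) use might look like:
//
//	cfg := backoff.Config{MaxWait: 30 * time.Second}
//	err := cfg.Retry(ctx, func() error {
//		return doSomethingFlaky() // retried until it returns nil
//	})
//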
50 | func (c Config) Retry(ctx context.Context, try func() error) error { 51 | 52 | // Make sure we have a valid error reporter 53 | if c.Report == nil { 54 | c.Report = defaultReport 55 | } 56 | 57 | // Return immediately if ctx was already cancelled 58 | if ctx.Err() != nil { 59 | return ctx.Err() 60 | } 61 | 62 | backoff := time.Duration(1) // minimum backoff duration 63 | for { 64 | before := time.Now() 65 | err := try() 66 | if err == nil { // success 67 | return nil 68 | } 69 | elapsed := time.Since(before) 70 | 71 | // Report the error as appropriate 72 | err = c.Report(err) 73 | if err != nil { 74 | return err // abort the retry loop 75 | } 76 | 77 | // Wait for an exponentially-growing random backoff period, 78 | // with the duration of each operation attempt as the minimum 79 | if backoff <= elapsed { 80 | backoff = elapsed 81 | } 82 | backoff += time.Duration(rand.Int63n(int64(backoff))) 83 | if c.MaxWait > 0 && backoff > c.MaxWait { 84 | backoff = c.MaxWait 85 | } 86 | 87 | // Wait for either the backoff timer or a cancel signal. 88 | t := time.NewTimer(backoff) 89 | select { 90 | case <-t.C: // Backoff timer expired 91 | continue 92 | 93 | case <-ctx.Done(): // Our context got cancelled 94 | t.Stop() 95 | return ctx.Err() 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /go/lib/backoff/retry_test.go: -------------------------------------------------------------------------------- 1 | package backoff 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | func TestRetry(t *testing.T) { 12 | 13 | n := 0 14 | try := func() error { 15 | n++ 16 | if n < 30 { 17 | return errors.New(fmt.Sprintf("test error %d", n)) 18 | } 19 | return nil 20 | } 21 | Retry(context.Background(), try) 22 | } 23 | 24 | func TestTimeout(t *testing.T) { 25 | 26 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 27 | try := func() error { 28 | return errors.New("haha, never going to succeed") 29 | } 30 | if err := Retry(ctx, try); err != context.DeadlineExceeded { 31 | t.Errorf("got wrong error from Retry: %v", err.Error()) 32 | } 33 | 34 | // Now test with an already-cancelled context 35 | try = func() error { 36 | panic("shouldn't get here!") 37 | } 38 | if err := Retry(ctx, try); err != context.DeadlineExceeded { 39 | t.Errorf("got wrong error from Retry: %v", err.Error()) 40 | } 41 | 42 | // for good measure 43 | cancel() 44 | } 45 | -------------------------------------------------------------------------------- /go/lib/backoff/rfq/doc.go: -------------------------------------------------------------------------------- 1 | // Package rfq implements responsively-fair queueing (RFQ). 2 | // or distributed queueing? RFDQ 3 | // 4 | // If a server has limited resources to serve a potentially-unlimited 5 | // number of clients (especially in a flash crowd or DDoS attack setting), 6 | // we wish to allocate the server's limited resources fairly among clients, 7 | // so that (for example) fast clients cannot indefinitely starve slower ones. 8 | // This definition of fairness implies wait-freedom, i.e., lack of starvation. 9 | // We would like the server to be able to serve an arbitrary number of clients 10 | // in a fair or at least starvation-free way, with only constant server state. 
11 | // 12 | // One approach is for the server to organize the clients into a literal queue, 13 | // with each client responsible for remembering who is next in the queue, 14 | // so the space required for next pointers is distributed among the clients. 15 | // This is what queue-based multiprocessor shared memory locking algorithms do. 16 | // But it works only when the clients are perfectly reliable and trustworthy: 17 | // if not, a single crashed client breaks the chain 18 | // and leaves all clients waiting behind it "dangling" and blocked forever 19 | // (at last without introducing timeouts or similar recovery mechanisms). 20 | // 21 | // Another baseline approach is to have all clients run a backoff algorithm 22 | // when they submit a request that the server must reject due to a full queue. 23 | // This approach might be statistically fair and starvation-free 24 | // if all the clients have similar processing speed and connectivity, 25 | // but clients that are much faster than others can starve slow clients. 26 | // This is because if some number of fast clients can saturate the server, 27 | // re-filling the server's queue only a brief moment after space opens up, 28 | // and a slow client's network round-trip time and/or client-side delay 29 | // add up to significantly more than the server's work-item processing time, 30 | // then each time the slow client attempts to retry it will always find 31 | // that the server's queue has already been filled again by the fast clients. 32 | // 33 | // A next-step solution to ensure approximate fairness across all clients 34 | // would be for the server to propagate maximum backoff delays among clients. 35 | // For example, suppose a slow client attempts to submit a request, 36 | // is rejected due to a full server queue, resubmits it t ms later, 37 | // and is again rejected, increasing its backoff timer to 2t ms. 38 | // If the original t ms value was dominated by client-side or network delays, 39 | // and the work-item processing times for fast clients is significantly less, 40 | // then with independent backoff delays the slow client will be starved. 41 | // But if the server notices that the slow client has backed off to 2ms, 42 | // and in response forces *all* clients to use a maximum backoff of 2ms 43 | // until the slow client's request has been satisfied, 44 | // then the slow client will no longer be starved 45 | // and allocation of the server's resources will be approximately fair. 46 | // 47 | // This approach fails to be responsive, however: the server's response times 48 | // to fast clients are slowed to that of the slowest client at a given time. 49 | // This approach can also greatly underutilize the server's resources: 50 | // the server may be perfectly able to process many work-items every 2ms, 51 | // but has slowed itself down to the rate of the slowest client for fairness. 52 | // Pursuing such strong fairness also creates DoS attack vectors, 53 | // since it is trivial for a misbehaved client simply to pretend to be slow. 54 | // In practice we cannot achieve both perfect fairness and responsiveness: 55 | // utilizing the server's full capacity to service fast clients quickly 56 | // inherently means that fast clients obtain more resources than slow clients. 57 | // But we would still like to be "reasonably" fair while also responsive, 58 | // and particualrly to ensure that no client, however slow, is starved. 
59 | // 60 | // RFQ thus provides "responsively-fair queueing", 61 | // which ensures statistical fairness among clients that see similar delays, 62 | // and similarly ensures fairness among clients in different delay classes. 63 | // ... 64 | // 65 | // Server has a limited internal queue, which it keeps sorted 66 | // oldest-request-first as judged by the server's own clock. 67 | // An externally-queued request can bump an internally-queued request 68 | // if the server has previously outsourced it to the client and forgotten it 69 | // but its approximate service time has arrived and the client resubmitted it. 70 | // 71 | // Tolerating misbehaving (Byzantine) clients: 72 | // use server-side MAC (and optionally encryption) to protect the state 73 | // the server outsources to clients. 74 | // 75 | // Issue: replay attacks, since server doesn't have storage to remember 76 | // which tokens have and haven't been "used" or how many times. 77 | // Full processing might reveal and neutralize the effect of a replay -- 78 | // e.g., where a cryptocurrency server finds a UTXO was already spent -- 79 | // but full processing might be significantly more costly in resources. 80 | // One simple defense is to have the server keep a record (e.g., hash table) 81 | // of all the tokens that have been processed within some past epoch. 82 | // The server can then detect and trivially discard replays within one epoch, 83 | // thereby rate-limiting the effectiveness of token replays to one per epoch. 84 | // This takes storage linear in the epoch length and server's processing rate, 85 | // but is independent of the number clients contending to submit requests. 86 | // 87 | // If it is acceptable to impose a maximum round-trip delay on any client, 88 | // denying service to clients that can't resubmit a request within one epoch, 89 | // then the server can presume requests from earlier epochs to be replays 90 | // and reject them unconditionally, thereby eliminating replay attacks. 91 | // 92 | package rfq 93 | -------------------------------------------------------------------------------- /go/lib/cas/cas.go: -------------------------------------------------------------------------------- 1 | // Package cas defines a simple compare-and-set (CAS) state interface. 2 | // It defines a generic access interface called Store, 3 | // and a simple in-memory CAS register called Register. 4 | // 5 | package cas 6 | 7 | import ( 8 | "context" 9 | "sync" 10 | ) 11 | 12 | // Store defines a CAS storage abstraction via a single CompareAndSet method. 13 | // 14 | // CompareAndSet writes a proposed new value to the state, 15 | // provided the state still has the specified old value. 16 | // The compare and conditional write are guaranteed to be atomic, 17 | // ensuring that the caller can avoid undetected state loss due to races. 18 | // CompareAndSet then reads and returns the latest actual state value. 19 | // 20 | // State values are arbitrary opaque Go strings, and may contain binary data. 21 | // While values in principle have no particular length limit, in practice 22 | // Store implementations may expect them to be "reasonably small", i.e., 23 | // efficient for storing metadata but not necessarily for bulk data storage. 24 | // 25 | // The Store assigns a version number to each value CompareAndSet returns. 26 | // Version numbers must be monotonic but need not be assigned consecutively. 
27 | // The version number must increase when the stored value changes, 28 | // and may increase at other times even when the value hasn't changed. 29 | // The caller may simply ignore the version numbers CompareAndSet returns, 30 | // or may use them for consistency-checking and debugging: 31 | // see the Checked wrapper function in the test subpackage for example. 32 | // Version numbers do not impose a burden on Store interface implementations, 33 | // in part because it's easy to adapt a non-versioned underlying CAS interface 34 | // with a simple wrapper that attaches a version number to each proposed value. 35 | // 36 | // CompareAndSet takes a Context parameter so that long-running implementations, 37 | // particularly those accessing remote storage in a distributed system, 38 | // can respond to cancellation requests and timeouts appropriately. 39 | // For robust asynchronous operation, CompareAndSet should return err != nil 40 | // only when its context is cancelled or when it encounters an error 41 | // that it detects to be permanent and unrecoverable for sure. 42 | // On encountering errors that may be temporary (e.g., due to network outages), 43 | // it is better for the Store to keep trying until success or cancellation, 44 | // using the lib/backoff package for example. 45 | // 46 | type Store interface { 47 | CompareAndSet(ctx context.Context, old, new string) ( 48 | version int64, actual string, err error) 49 | } 50 | 51 | // Register implements a simple local-memory CAS register. 52 | // It is thread-safe and ready for use on instantiation. 53 | type Register struct { 54 | mut sync.Mutex // for synchronizing accesses 55 | ver int64 // version number of the latest value 56 | val string // the latest value written 57 | } 58 | 59 | // CompareAndSet implements the Store interface for the CAS register. 60 | func (r *Register) CompareAndSet(ctx context.Context, old, new string) ( 61 | version int64, actual string, err error) { 62 | 63 | r.mut.Lock() 64 | defer r.mut.Unlock() 65 | 66 | // Update the value only if the current value is as expected. 67 | if r.val == old { 68 | r.ver, r.val = r.ver+1, new 69 | } 70 | 71 | // Return the actual new value, changed or not. 72 | return r.ver, r.val, nil 73 | } 74 | 75 | -------------------------------------------------------------------------------- /go/lib/cas/test/cas.go: -------------------------------------------------------------------------------- 1 | // Package test implements shareable code for testing instantiations 2 | // of the cas.Store check-and-set storage interface. 3 | package test 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | "math/rand" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/dedis/tlc/go/lib/cas" 13 | ) 14 | 15 | // History records a history of cas.Store version/value observations, 16 | // typically made across concurrent goroutines or even distributed nodes, 17 | // and checks all these observations for consistency. 18 | // 19 | type History struct { 20 | hist map[int64]string // version-value map defining observed history 21 | mut sync.Mutex // mutex protecting this reference order 22 | } 23 | 24 | // Observe records an old/new value pair that was observed via a cas.Store, 25 | // checks it for consistency against all prior recorded old/new value pairs, 26 | // and reports any errors via testing context t. 
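// (Aside, for illustration of the cas package shown above rather than of this
// test helper: a client might drive any cas.Store, such as a cas.Register,
// with the usual compare-and-set retry loop sketched below. This is only a
// sketch; the "hello" value is an arbitrary placeholder.)
//
//	var reg cas.Register
//	old := "" // a fresh Register starts with the empty value
//	for {
//		_, actual, err := reg.CompareAndSet(context.Background(), old, "hello")
//		if err != nil || actual == "hello" {
//			break // our value won, or a permanent error/cancellation occurred
//		}
//		old = actual // another writer won the race; retry against its value
//	}
//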
27 | // 28 | func (to *History) Observe(t *testing.T, version int64, value string) { 29 | to.mut.Lock() 30 | defer to.mut.Unlock() 31 | 32 | // Create the version/value map if it doesn't already exist 33 | if to.hist == nil { 34 | to.hist = make(map[int64]string) 35 | } 36 | 37 | // If there is a recorded value for this version, it must be the same. 38 | if old, exist := to.hist[version]; exist && old != value { 39 | t.Errorf("\nInconsistency:\n ver %v\n old %q\n new %q\n", 40 | version, old, value) 41 | } 42 | 43 | // Record the new successor 44 | to.hist[version] = value 45 | } 46 | 47 | // Checked wraps the provided CAS store with a consistency-checker 48 | // that records all requested and observed accesses against history h, 49 | // reporting any inconsistency errors discovered via testing context t. 50 | // 51 | // The wrapper also consistency-checks the caller's accesses to the Store, 52 | // e.g., that the provided old value is indeed the last version retrieved. 53 | // This means that when checking a Store that is shared across goroutines, 54 | // each goroutine must have its own Checked wrapper around that Store. 55 | // 56 | func Checked(t *testing.T, h *History, store cas.Store) cas.Store { 57 | return &checkedStore{t: t, h: h, s: store} 58 | } 59 | 60 | type checkedStore struct { 61 | t *testing.T // Testing context 62 | h *History // History we're using for consistency-checking 63 | s cas.Store // Underlying compare-and-set Store 64 | 65 | lver int64 // Last version number read from the underlying Store 66 | lval string // Last value read from the underlying Store 67 | 68 | rver int64 // Our fudged informational version numbers for testing 69 | } 70 | 71 | func (cs *checkedStore) CompareAndSet(ctx context.Context, old, new string) ( 72 | version int64, actual string, err error) { 73 | 74 | // Sanity-check the arguments we're passed 75 | if old != cs.lval { 76 | cs.t.Errorf("CompareAndSet: wrong old value %q != %q", 77 | old, cs.lval) 78 | } 79 | if new == "" { 80 | cs.t.Errorf("CompareAndSet: new value empty") 81 | } 82 | if new == old { 83 | cs.t.Errorf("CompareAndSet: new value identical to old") 84 | } 85 | 86 | // Try to change old to new atomically. 87 | version, actual, err = cs.s.CompareAndSet(ctx, old, new) 88 | 89 | // Sanity-check the Store-assigned version numbers 90 | if version < cs.lver { 91 | cs.t.Errorf("CompareAndSet: Store version number decreased") 92 | } 93 | if version == cs.lver && actual != cs.lval { 94 | cs.t.Errorf("CompareAndSet: Store version failed to increase") 95 | } 96 | 97 | // Record and consistency-check all version/value pairs we observe. 98 | cs.h.Observe(cs.t, version, actual) 99 | 100 | // Produce our own informational version numbers to return 101 | // that increase a bit unpredictability for testing purposes. 102 | if version > cs.lver { 103 | cs.rver++ 104 | } 105 | cs.rver += rand.Int63n(3) 106 | 107 | // Update our cached record of the underlying Store's last state 108 | cs.lver, cs.lval = version, actual 109 | 110 | // Return the actual new value regardless. 111 | return cs.rver, actual, err 112 | } 113 | 114 | // Stores torture-tests one or more cas.Store interfaces 115 | // that are all supposed to represent the same consistent underlying state. 116 | // The test is driven by nthreads goroutines per Store interface, 117 | // each of which performs naccesses CAS operations on its interface. 
118 | // 119 | func Stores(t *testing.T, nthreads, naccesses int, store ...cas.Store) { 120 | 121 | bg := context.Background() 122 | wg := sync.WaitGroup{} 123 | h := &History{} 124 | 125 | tester := func(i, j int) { 126 | cs := Checked(t, h, store[i]) 127 | old, err := "", error(nil) 128 | for k := 0; k < naccesses; k++ { 129 | new := fmt.Sprintf("store %v thread %v access %v", 130 | i, j, k) 131 | //println("tester", i, j, "access", k) 132 | _, old, err = cs.CompareAndSet(bg, old, new) 133 | if err != nil { 134 | t.Error("CompareAndSet: " + err.Error()) 135 | } 136 | } 137 | //println("tester", i, j, "done") 138 | wg.Done() 139 | } 140 | 141 | // Launch a set of goroutines for each Store interface. 142 | // To maximize cross-store concurrency, 143 | // launch the first thread per store, then the second per store, etc. 144 | for j := 0; j < nthreads; j++ { 145 | for i := range store { 146 | wg.Add(1) 147 | go tester(i, j) 148 | } 149 | } 150 | 151 | // Wait for all tester goroutines to complete 152 | wg.Wait() 153 | } 154 | -------------------------------------------------------------------------------- /go/lib/cas/test/cas_test.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/dedis/tlc/go/lib/cas" 7 | ) 8 | 9 | // Test the Client with a trivial in-memory key/value Store implementation. 10 | func TestRegister(t *testing.T) { 11 | Stores(t, 100, 100000, &cas.Register{}) 12 | } 13 | -------------------------------------------------------------------------------- /go/lib/doc.go: -------------------------------------------------------------------------------- 1 | // Sub-packages of this package contains common library functionality 2 | // useful in implementations of threshold logical clocks and consensus. 3 | package lib 4 | -------------------------------------------------------------------------------- /go/lib/fs/atomic/atomic.go: -------------------------------------------------------------------------------- 1 | // This package supports writing files atomically 2 | // while ensuring "at-most-once" semantics. 3 | package atomic 4 | 5 | import ( 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "os" 10 | "path/filepath" 11 | ) 12 | 13 | // WriteFileOnce attempts to write data to filename atomically, only once, 14 | // failing with ErrExist if someone else already wrote a file at filename. 15 | // 16 | // Ensures that no one ever sees a zero-length or incomplete file 17 | // at the target filename, by writing data to a temporary file first, 18 | // synchronizing it to stable storage, then atomically linking it into place. 19 | // 20 | // This code solves a different problem from, but is partly inspired by: 21 | // https://github.com/google/renameio 22 | // https://github.com/natefinch/atomic 23 | 24 | func WriteFileOnce(filename string, data []byte, perm os.FileMode) error { 25 | 26 | // Create a temporary file in the target directory, 27 | // mainly to ensure that it's on the same volume for hard linking. 28 | dir, name := filepath.Split(filename) 29 | pattern := fmt.Sprintf("%s-*.tmp", name) 30 | tmpfile, err := ioutil.TempFile(dir, pattern) 31 | if err != nil { 32 | return err 33 | } 34 | 35 | // Make sure it gets closed and removed regardless of outcome. 36 | tmpname := tmpfile.Name() 37 | defer func() { 38 | tmpfile.Close() 39 | os.Remove(tmpname) 40 | }() 41 | 42 | // Write the data to the temporary file. 
43 | n, err := tmpfile.Write(data) 44 | if err != nil { 45 | return err 46 | } 47 | if n < len(data) { 48 | return errors.New("short write") 49 | } 50 | 51 | // Set the correct file permissions 52 | if err := tmpfile.Chmod(perm); err != nil { 53 | return err 54 | } 55 | 56 | // Force the newly-written data to stable storage. 57 | // For background on this see commends for CloseAtomicallyReplace 58 | // at https://github.com/google/renameio/blob/master/tempfile.go 59 | // 60 | if err := tmpfile.Sync(); err != nil { 61 | return err 62 | } 63 | 64 | if err := tmpfile.Close(); err != nil { 65 | return err 66 | } 67 | 68 | // Atomically hard-link the temporary file into the target filename. 69 | // Unlike os.Rename, this fails if target filename already exists. 70 | if err := os.Link(tmpname, filename); err != nil { 71 | return err 72 | } 73 | 74 | return nil 75 | } 76 | -------------------------------------------------------------------------------- /go/lib/fs/atomic/atomic_test.go: -------------------------------------------------------------------------------- 1 | package atomic 2 | 3 | import ( 4 | "io/ioutil" 5 | "math/rand" 6 | "os" 7 | "sync" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func TestWriteFileOnce(t *testing.T) { 13 | 14 | filename := "testfile.tmp" 15 | var wg sync.WaitGroup 16 | writer := func(i int) { 17 | 18 | // Sleep a small random duration to jitter the test 19 | time.Sleep(time.Duration(rand.Int63n(int64(time.Microsecond)))) 20 | //println("thread",i,"writing") 21 | 22 | // Try to write the file 23 | b := make([]byte, i) // create i-length file filled with i's 24 | for j := range b { 25 | b[j] = byte(i) 26 | } 27 | err := WriteFileOnce(filename, b, 0644) 28 | if err != nil && !os.IsExist(err) { 29 | t.Error("WriteFileOnce:", err) 30 | } 31 | 32 | // Now try to read the file that got written 33 | b, err = ioutil.ReadFile(filename) 34 | if err != nil { 35 | t.Error("ReadFile", err) 36 | } 37 | 38 | // Check that what we read back is valid 39 | //println("thread",i,"read",len(b)) 40 | i = len(b) 41 | if i == 0 { 42 | t.Error("zero-length file shouldn't be possible") 43 | } 44 | for j := range b { 45 | if b[j] != byte(i) { 46 | t.Error("read file has wrong byte at", j) 47 | } 48 | } 49 | 50 | wg.Done() 51 | } 52 | 53 | // Test with increasing numbers of threads 54 | for n := 1; n <= 128; n *= 2 { 55 | 56 | //println("\ntesting", n, "threads") 57 | for i := 1; i <= n; i++ { 58 | wg.Add(1) 59 | go writer(i) 60 | } 61 | wg.Wait() 62 | 63 | os.Remove(filename) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /go/lib/fs/casdir/state.go: -------------------------------------------------------------------------------- 1 | // Package casdir implements a versioned check-and-set (CAS) state abstraction 2 | // in a directory on a standard POSIX-compatible file system. 3 | // 4 | // See the tlc/go/lib/cas package for general information 5 | // on this CAS state abstraction. 6 | // 7 | // This implementation is just a simple wrapper around the verst package, 8 | // which provides a slightly-more-general versioned state abstraction. 9 | // To implement CAS, in essence, we simply expire old versions immediately 10 | // as soon as any new version is written. 
11 | // 12 | package casdir 13 | 14 | import ( 15 | "context" 16 | 17 | "github.com/dedis/tlc/go/lib/fs/verst" 18 | ) 19 | 20 | // Store implements the compare-and-set state abstraction 21 | // generically defined by the cas.Store interface, 22 | // holding the underlying state in a POSIX directory. 23 | // 24 | // The underlying state directory may be shared locally or remotely 25 | // (e.g., via NFS-mounted file systems), 26 | // provided that file system accesses ensure file-level POSIX atomicity. 27 | // 28 | // Each Store instance is intended for use by only one goroutine at a time, 29 | // so the client must synchronize shared uses across multiple goroutines. 30 | // 31 | type Store struct { 32 | vs verst.State // underlying versioned state 33 | lver int64 // last version we've read 34 | lval string // application value associated with lver 35 | } 36 | 37 | // Init sets Store to refer to a CAS register at a given file system path. 38 | // If create is true, creates the designated directory if it doesn't exist. 39 | // If excl is true, fails if the designated directory already exists. 40 | // 41 | func (st *Store) Init(path string, create, excl bool) error { 42 | return st.vs.Init(path, create, excl) 43 | } 44 | 45 | // CompareAndSet writes value new provided the state still holds value old, 46 | // then reads and returns the actual current state version and value. 47 | // 48 | func (st *Store) CompareAndSet(ctx context.Context, old, new string) ( 49 | version int64, actual string, err error) { 50 | 51 | if old != st.lval { 52 | panic("CompareAndSet: wrong old value") 53 | } 54 | 55 | // Try to write the new version to the underlying versioned store - 56 | // but don't fret if someone else wrote it or if it has expired. 57 | ver := st.lver + 1 58 | err = st.vs.WriteVersion(ver, new) 59 | if err != nil && !verst.IsExist(err) && !verst.IsNotExist(err) { 60 | return 0, "", err 61 | } 62 | 63 | // Now read back whatever value was successfully written. 64 | val, err := st.vs.ReadVersion(ver) 65 | if err != nil && verst.IsNotExist(err) { 66 | 67 | // The requested version has probably been aged out, 68 | // so catch up to the most recent committed value. 69 | ver, val, err = st.vs.ReadLatest() 70 | } 71 | if err != nil { 72 | return 0, "", err 73 | } 74 | 75 | // Expire all versions before this latest one 76 | st.vs.Expire(ver) 77 | 78 | // Return the actual version and value that we read 79 | st.lver, st.lval = ver, val 80 | return ver, val, err 81 | } 82 | -------------------------------------------------------------------------------- /go/lib/fs/verst/state.go: -------------------------------------------------------------------------------- 1 | // Package verst implements a simple persistent versioned state abstraction 2 | // in a directory on a standard POSIX-compatible file system. 3 | // 4 | // The abstraction that verst presents is essentially a key/value store, 5 | // in which the keys are sequentially-increasing version numbers, 6 | // and the values are opaque byte strings (which we represent as Go strings). 7 | // The main operations verst provides are 8 | // reading a particular (or the latest) version, 9 | // and writing a new version as a successor to the latest version. 10 | // The implementation ensures that new version writes are atomic: 11 | // clients will never read partially-written values, for example. 
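// (Aside, for illustration of the casdir package shown above rather than of
// verst itself: a minimal usage sketch, where "/shared/reg" is a hypothetical
// directory on a POSIX file system visible to all participants and ctx is a
// caller-supplied context.)
//
//	st := &casdir.Store{}
//	if err := st.Init("/shared/reg", true, false); err != nil {
//		// handle error
//	}
//	ver, actual, err := st.CompareAndSet(ctx, "", "proposal-1")
//	// actual holds whichever value actually won; retry from it if necessary.
//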
12 | // If several clients attempt to write the same new version concurrently, 13 | // one will succeed while all the others will fail, 14 | // and potentially need to retry with respect to the new latest version. 15 | // 16 | // The package is designed assuming that values are small, 17 | // e.g., metadata rather than bulk data, appropriate for Go strings. 18 | // and reading/writing all at once as atomic units. 19 | // Bulk data should be handled by other means. 20 | // 21 | // The verst package uses simple atomic POSIX file system operations, 22 | // with no locking, to manage concurrency in the underlying file system. 23 | // It supports garbage-collection of old state versions 24 | // by using atomic POSIX directory-manipulation operations. 25 | // Barring bugs, it "should" not be possible to violate 26 | // the guaranteed atomicity properties or corrupt the state store 27 | // regardless of how many clients may be competing to access it 28 | // or with what access patterns or delays. 29 | // This atomicity is necessarily only as good as the underlying file system's 30 | // guarantee of atomicity and consistency of the underlying operations: 31 | // e.g., if the underlying file system can leave a rename operation 32 | // half-completed after a badly-timed crash, the state could be corrupted. 33 | // 34 | // The design of verst guarantees progress, but not fairness: 35 | // that is, by standard definitions it is lock-free but not wait free 36 | // (https://en.wikipedia.org/wiki/Non-blocking_algorithm). 37 | // Regardless of the amount of contention to write a new version, for example, 38 | // verst guarantees that at least one client will be able to make progress. 39 | // It makes no guarantee of a "fair" rotation among clients, however, 40 | // or that some particularly slow or otherwise unlucky client will not starve. 41 | // 42 | // While this package currently lives in the tlc repository, 43 | // it is not particularly specific to TLC and depends on nothing else in it, 44 | // and hence might eventually be moved to a more generic home if appropriate. 45 | // 46 | // XXX describe the techniques in a bit more detail. 47 | // 48 | package verst 49 | 50 | import ( 51 | "fmt" 52 | "io/ioutil" 53 | "os" 54 | "path/filepath" 55 | // "errors" 56 | 57 | "github.com/bford/cofo/cbe" 58 | "github.com/dedis/tlc/go/lib/fs/atomic" 59 | ) 60 | 61 | //const versPerGen = 100 // Number of versions between generation subdirectories 62 | const versPerGen = 10 // Number of versions between generation subdirectories 63 | 64 | const genFormat = "gen-%d" // Format for generation directory names 65 | const verFormat = "ver-%d" // Format for register version file names 66 | 67 | // State holds cached state for a single verst versioned register. 68 | type State struct { 69 | path string // Base pathname of directory containing register state 70 | genVer int64 // Version number of highest generation subdirectory 71 | genPath string // Pathname to generation subdirectory 72 | ver int64 // Highest register version known to exist already 73 | val string // Cached register value for highest known version 74 | expVer int64 // Version number before which state is expired 75 | } 76 | 77 | // Initialize State to refer to a verst register at a given file system path. 78 | // If create is true, create the designated directory if it doesn't exist. 79 | // If excl is true, fail if the designated directory already exists. 
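// (A minimal usage sketch of this package, assuming "/shared/verst" is a
// hypothetical directory accessible to all clients; error handling elided.)
//
//	var st State
//	err := st.Init("/shared/verst", true, false)
//	ver, val, err := st.ReadLatest()          // latest version and value
//	err = st.WriteVersion(ver+1, "new value") // propose a successor version
//	st.Expire(ver)                            // versions before ver may now be deleted
//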
80 | func (st *State) Init(path string, create, excl bool) error { 81 | *st = State{path: path} // Set path and clear cached state 82 | 83 | // First check if the path already exists and is a directory. 84 | stat, err := os.Stat(path) 85 | switch { 86 | case err == nil && !stat.IsDir(): 87 | return os.ErrExist // already exists, but not a directory 88 | 89 | case err == nil && !excl: 90 | return st.refresh() // exists: load our cache from it 91 | 92 | case err != nil && (!IsNotExist(err) || !create): 93 | return err // didn't exist and we can't create it 94 | } 95 | 96 | // Create and initialize the version state directory, 97 | // initially with a temporary name for atomicity. 98 | dir, name := filepath.Split(path) 99 | if dir == "" { 100 | dir = "." // Ensure dir is nonempty 101 | } 102 | tmpPath, err := ioutil.TempDir(dir, name+"-*.tmp") 103 | if err != nil { 104 | return err 105 | } 106 | defer func() { // Clean up on return if we can't move it into place 107 | os.RemoveAll(tmpPath) 108 | }() 109 | 110 | // Create an initial generation directory for state version 0 111 | genPath := filepath.Join(tmpPath, fmt.Sprintf(genFormat, 0)) 112 | err = os.Mkdir(genPath, 0777) 113 | if err != nil { 114 | return err 115 | } 116 | 117 | // Create an initial state version 0 with the empty string as its value 118 | err = writeVerFile(genPath, fmt.Sprintf(verFormat, 0), "", "") 119 | if err != nil { 120 | return err 121 | } 122 | 123 | // Atomically move the temporary version state directory into place. 124 | err = os.Rename(tmpPath, path) 125 | if err != nil && (excl || !IsExist(err)) { 126 | return err 127 | } 128 | 129 | // Finally, load our cache from the state directory. 130 | return st.refresh() 131 | } 132 | 133 | // Refresh our cached state in attempt to "catch up" to the 134 | // latest register version on the file system. 135 | // Of course the file system may be a constantly-moving target 136 | // so the refreshed state could be stale again immediately on return. 137 | func (st *State) refresh() error { 138 | 139 | // First find the highest-numbered state generation subdirectory 140 | genver, genname, _, err := scan(st.path, genFormat, 0) 141 | if err != nil { 142 | return err 143 | } 144 | 145 | // Then find the highest-numbered register version in that subdirectory 146 | genpath := filepath.Join(st.path, genname) 147 | regver, regname, _, err := scan(genpath, verFormat, 0) 148 | if err != nil { 149 | return err 150 | } 151 | 152 | // Read that highest register version file 153 | val, _, err := readVerFile(genpath, regname) 154 | if err != nil { 155 | return err 156 | } 157 | 158 | st.genVer = genver 159 | st.genPath = genpath 160 | 161 | st.ver = regver 162 | st.val = val 163 | 164 | return nil 165 | } 166 | 167 | // Scan a directory for highest-numbered file or subdirectory matching format. 168 | // If upTo > 0, returns the highest-numbered version no higher than upTo. 169 | func scan(path, format string, upTo int64) ( 170 | maxver int64, maxname string, names []string, err error) { 171 | 172 | // Scan the verst directory for the highest-numbered subdirectory. 
173 | dir, err := os.Open(path) 174 | if err != nil { 175 | return 0, "", nil, err 176 | } 177 | info, err := dir.Readdir(0) 178 | if err != nil { 179 | return 0, "", nil, err 180 | } 181 | 182 | // Find the highest-numbered generation subdirectory 183 | maxver = -1 184 | for i := range info { 185 | name := info[i].Name() 186 | 187 | // Scan the version number embedded in the name, if any, 188 | // and confirm that the filename exactly matches the format. 189 | var ver int64 190 | n, err := fmt.Sscanf(name, format, &ver) 191 | if n < 1 || err != nil || name != fmt.Sprintf(format, ver) { 192 | continue 193 | } 194 | 195 | // Find the highest extant version number 196 | // (no greater than upTo, if upTo is nonzero) 197 | if ver > maxver && (upTo == 0 || ver <= upTo) { 198 | maxver, maxname = ver, name 199 | } 200 | 201 | // If upTo is nonzero, collect all the matching names. 202 | if upTo > 0 && ver <= upTo { 203 | names = append(names, name) 204 | } 205 | } 206 | if maxver < 0 { // No highest version!? oops 207 | return 0, "", nil, os.ErrNotExist 208 | } 209 | return 210 | } 211 | 212 | // Read and parse the register version file at regpath. 213 | func readVerFile(genPath, verName string) (val, nextGen string, err error) { 214 | 215 | regPath := filepath.Join(genPath, verName) 216 | b, err := ioutil.ReadFile(regPath) 217 | if err != nil { 218 | return "", "", err 219 | } 220 | 221 | // The encoded value is always first and not optional 222 | rb, b, err := cbe.Decode(b) 223 | if err != nil { 224 | println("corrupt verst version file " + regPath) 225 | return "", "", err 226 | } 227 | 228 | // The encoded next-generation directory name is optional 229 | nxg, b, err := cbe.Decode(b) 230 | // (ignore decoding errors) 231 | 232 | return string(rb), string(nxg), nil 233 | } 234 | 235 | // Read the latest version of the stored state, 236 | // returning both the highest version number (key) and associated value. 237 | // Of course a new version might be written at any time, 238 | // so the caller must assume this information could become stale immediately. 239 | func (st *State) ReadLatest() (ver int64, val string, err error) { 240 | 241 | if err := st.refresh(); err != nil { 242 | return 0, "", err 243 | } 244 | return st.ver, st.val, nil 245 | } 246 | 247 | // Read a specific version of the stored state, 248 | // returning the associated value if possible. 249 | // Returns ErrNotExist if the specified version does not exist, 250 | // either because it has never been written or because it has been expired. 251 | func (st *State) ReadVersion(ver int64) (val string, err error) { 252 | 253 | // In the common case of reading back the last-written version, 254 | // just return its value from our cache. 255 | if ver == st.ver { 256 | return st.val, nil 257 | } 258 | 259 | // Find and read the appropriate version file 260 | val, err = st.readUncached(ver) 261 | if err != nil { 262 | return "", err 263 | } 264 | 265 | // Update our cached state as appropriate. 
266 | if ver > st.ver { 267 | st.ver = ver 268 | st.val = val 269 | } 270 | 271 | return val, nil 272 | } 273 | 274 | func (st *State) readUncached(ver int64) (val string, err error) { 275 | 276 | // Optimize for sequential reads of the "next" version 277 | verName := fmt.Sprintf(verFormat, ver) 278 | if ver >= st.genVer { 279 | val, _, err := readVerFile(st.genPath, verName) 280 | if err == nil { 281 | return val, nil // success 282 | } 283 | if !IsNotExist(err) { 284 | return "", err // error other than non-existent 285 | } 286 | } 287 | 288 | // Fallback: scan for the generation containing requested version. 289 | //println("readUncached: fallback at", ver) 290 | genVer, genName, _, err := scan(st.path, genFormat, ver) 291 | if err != nil { 292 | return "", err 293 | } 294 | //println("readUncached: found", ver, "in gen", genVer) 295 | 296 | // The requested version should be in directory genName if it exists. 297 | genPath := filepath.Join(st.path, genName) 298 | val, _, err = readVerFile(genPath, verName) 299 | if err != nil { 300 | return "", err 301 | } 302 | 303 | // Update our cached generation state 304 | if ver >= st.ver { 305 | println("moving to generation", genVer, "at ver", ver) 306 | st.genVer = genVer 307 | st.genPath = genPath 308 | } 309 | 310 | return val, err 311 | } 312 | 313 | // Write version ver with associated value val if ver is not yet written. 314 | // The caller may skip version numbers, e.g., to catch up a delayed store, 315 | // but must never try to (re-)write older versions up to the last written. 316 | // 317 | func (st *State) WriteVersion(ver int64, val string) (err error) { 318 | 319 | if ver <= st.ver { 320 | return ErrExist 321 | } 322 | verName := fmt.Sprintf(verFormat, ver) 323 | 324 | // Should this register version start a new generation? 325 | tmpGenName := "" 326 | if ver%versPerGen == 0 { 327 | 328 | // Prepare the new generation in a temporary directory first 329 | pattern := fmt.Sprintf(genFormat+"-*.tmp", ver) 330 | tmpPath, err := ioutil.TempDir(st.path, pattern) 331 | if err != nil { 332 | return err 333 | } 334 | defer func() { 335 | os.RemoveAll(tmpPath) 336 | }() 337 | tmpGenName = filepath.Base(tmpPath) 338 | 339 | // Write the new register version in the new directory (too) 340 | err = writeVerFile(tmpPath, verName, val, tmpGenName) 341 | if err != nil { 342 | return err 343 | } 344 | } 345 | 346 | // Write version into the (old) generation directory 347 | err = writeVerFile(st.genPath, verName, val, tmpGenName) 348 | if err != nil && !IsExist(err) { 349 | return err 350 | } 351 | 352 | // Read back whatever register version file actually got written, 353 | // which might be from someone else's write that won over ours. 354 | val, tmpGenName, err = readVerFile(st.genPath, verName) 355 | if err != nil { 356 | return err 357 | } 358 | 359 | // If the (actual) new version indicates a new generation directory, 360 | // try to move the temporary directory into its place. 361 | // It's harmless if multiple writers attempt this redundantly: 362 | // it fails if either the old temporary directory no longer exists 363 | // or if a directory with the new name already exists. 
364 | if tmpGenName != "" { 365 | oldGenPath := filepath.Join(st.path, tmpGenName) 366 | newGenPath := filepath.Join(st.path, 367 | fmt.Sprintf(genFormat, ver)) 368 | err := os.Rename(oldGenPath, newGenPath) 369 | if err != nil && !IsExist(err) && !IsNotExist(err) { 370 | return err 371 | } 372 | 373 | // It's a good time to expire old generations when feasible 374 | st.expireOld() 375 | 376 | // Update our cached generation state 377 | st.genVer = ver 378 | st.genPath = newGenPath 379 | } 380 | 381 | // Update our cached version state 382 | st.ver = ver 383 | st.val = val 384 | return nil 385 | } 386 | 387 | func writeVerFile(genPath, verName, val, nextGen string) error { 388 | 389 | // Encode the new register version file 390 | b := cbe.Encode(nil, []byte(val)) 391 | b = cbe.Encode(b, []byte(nextGen)) 392 | 393 | // Write it atomically 394 | verPath := filepath.Join(genPath, verName) 395 | if err := atomic.WriteFileOnce(verPath, b, 0644); err != nil { 396 | return err 397 | } 398 | 399 | return nil 400 | } 401 | 402 | // Expire indicates that state versions earlier than before may be deleted. 403 | // It does not necessarily delete these older versions immediately, however. 404 | // Attempts either to read or to write expired versions will fail. 405 | // 406 | func (st *State) Expire(before int64) { 407 | if st.expVer < before { 408 | st.expVer = before 409 | } 410 | } 411 | 412 | // Actually try to delete expired versions. 413 | // We do this only about once per generation for efficiency. 414 | func (st *State) expireOld() { 415 | 416 | // Find all existing generation directories up to version 'before' 417 | maxVer, maxName, names, err := scan(st.path, genFormat, st.expVer) 418 | if err != nil || len(names) == 0 { 419 | return // ignore errors, e.g., no expired generations 420 | } 421 | if maxVer < 0 || maxVer > st.expVer { 422 | println("expireOld oops", len(names), maxVer, st.expVer) 423 | panic("shouldn't happen") 424 | } 425 | 426 | // Delete all generation directories before maxVer, 427 | // since those can only contain versions strictly before maxVer. 428 | for _, genName := range names { 429 | if genName != maxName { 430 | genPath := filepath.Join(st.path, genName) 431 | atomicRemoveAll(genPath) 432 | } 433 | } 434 | } 435 | 436 | // Atomically remove the directory at path, 437 | // ensuring that no one sees inconsistent states within it, 438 | // by renaming it before starting to delete its contents. 439 | func atomicRemoveAll(path string) error { 440 | 441 | tmpPath := fmt.Sprintf("%s.old", path) 442 | if err := os.Rename(path, tmpPath); err != nil { 443 | return err 444 | } 445 | 446 | return os.RemoveAll(tmpPath) 447 | } 448 | 449 | // State.Write returns an error matching this predicate 450 | // when the version the caller asked to write already exists. 451 | func IsExist(err error) bool { 452 | return os.IsExist(err) 453 | } 454 | 455 | // State.Read returns an error matching this predicat 456 | // when the version the caller asked to read does not exist. 457 | func IsNotExist(err error) bool { 458 | return os.IsNotExist(err) 459 | } 460 | 461 | var ErrExist = os.ErrExist 462 | var ErrNotExist = os.ErrNotExist 463 | -------------------------------------------------------------------------------- /go/model/README.md: -------------------------------------------------------------------------------- 1 | This Go package provides a minimal implementation of 2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC) 3 | for fail-stop, non-Byzantine environments. 
4 | For background information on QSC and TLC, 5 | and other model implementations in several languages, please see the 6 | [top level of this repository](https://github.com/dedis/tlc/). 7 | For more details on this package see the code and its 8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/model). 9 | -------------------------------------------------------------------------------- /go/model/doc.go: -------------------------------------------------------------------------------- 1 | // Package model implements a simple pedagogic model of TLC and QSC. 2 | // It uses no cryptography and supports only failstop, non-Byzantine consensus, 3 | // but should be usable in scenarios that would typically employ Paxos or Raft. 4 | // 5 | // This implementation is less than 200 lines of actual code as counted by CLOC, 6 | // so a good way to understand it is to read the code directly at 7 | // https://github.com/dedis/tlc/tree/master/go/model. 8 | // You can test this implementation in a variety of consensus configurations 9 | // using only goroutines and channels for communication via: 10 | // 11 | // go test -v 12 | // 13 | // To read about the principles underlying TLC and QSC, please refer to 14 | // https://arxiv.org/abs/1907.07010. 15 | // For a high-level overview of the different implementations of TLC/QSC 16 | // in different languages that live in this repository, please see 17 | // https://github.com/dedis/tlc/. 18 | // 19 | // Configuring and launching consensus groups 20 | // 21 | // To use this implementation of QSC, 22 | // a user of this package must first configure and launch 23 | // a threshold group of nodes. 24 | // This package handles only the core consensus logic, 25 | // leaving matters such as node configuration, network names, connections, 26 | // and wire-format marshaling and unmarshaling to the client of this package. 27 | // 28 | // The client using this package 29 | // must assign each node a unique number from 0 through nnode-1, 30 | // e.g., by configuring the group with a well-known ordering of its members. 31 | // Only node numbers are important to this package; it is oblivious to names. 32 | // 33 | // When each node in the consensus group starts, 34 | // the client calls NewNode to initialize the node's TLC and QSC state. 35 | // The client may then change optional Node configuration parameters, 36 | // such as Node.Rand, before actually commencing protocol message processing. 37 | // The client then calls Node.Advance to launch TLC and the consensus protocol, 38 | // advance to time-step zero, and broadcast a proposal for this time-step. 39 | // Thereafter, the protocol self-clocks asynchronously using TLC 40 | // based on network communication. 41 | // 42 | // Consensus protocol operation and results 43 | // 44 | // This package implements QSC in pipelined fashion, which means that 45 | // a sliding window of three concurrent QSC rounds is active at any time. 46 | // At the start of any given time step s when Advance broadcasts a Raw message, 47 | // this event initiates a new consensus round starting at s and ending at s+3, 48 | // and (in the steady state) completes a consensus round that started at s-3. 49 | // Each Message a node broadcasts includes QSC state from four rounds: 50 | // Message.QSC[0] holds the results of the consensus round just completed, 51 | // while QSC[1] through QSC[3] hold the state of the three still-active rounds, 52 | // with QSC[3] being the newest round just launched. 
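// A minimal launch sketch following the configuration steps above
// (the thresholds and channel sizes are arbitrary example choices,
// and a real client would add its own termination condition):
//
//	const thres, nnode = 2, 3
//	peer := make([]chan *Message, nnode)
//	for i := range peer {
//		peer[i] = make(chan *Message, 1000) // toy in-memory "network"
//	}
//	send := func(dst int, msg *Message) { peer[dst] <- msg }
//	for i := 0; i < nnode; i++ {
//		n := NewNode(i, thres, nnode, send)
//		go func(i int, n *Node) {
//			n.Advance() // commence the protocol at time-step zero
//			for {
//				n.Receive(<-peer[i]) // self-clocks on received messages
//			}
//		}(i, n)
//	}
//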
53 | // 54 | // If Message.QSC[0].Commit is true in the Raw message commencing a time-step, 55 | // then this node saw the round ending at step Message.Step as fully committed. 56 | // In this case, all nodes will have agreed on the same proposal in that round, 57 | // which is the proposal made by node number Message.QSC[0].Conf.From. 58 | // If the client was waiting for a particular transaction to be ordered 59 | // or definitely committed/aborted according to the client's transaction rules, 60 | // then seeing that Message.QSC[0].Commit is true means that the client may 61 | // resolve the status of transactions proposed up to Message.Step-3. 62 | // Other nodes might not have observed this same round as committed, however, 63 | // so the client must not assume that other nodes also necessarily be aware 64 | // that this consensus round successfully committed. 65 | // 66 | // If Message.QSC[0].Commit is false, the round may or may not have converged: 67 | // this node simply cannot determine conclusively whether the round converged. 68 | // Other nodes might have chosen different "best confirmed" proposals, 69 | // as indicated in their respective QSC[0].Conf.From broadcasts for this step. 70 | // Alternatively, the round may in fact have converged, 71 | // and other nodes might observe that fact, even though this node did not. 72 | // 73 | // Message transmission, marshaling 74 | // 75 | // This package invokes the send function provided to NewNode to send messages, 76 | // leaving any wire-format marshaling required to the provided function. 77 | // This allows the client complete control over the desired wire format, 78 | // and to include other information beyond the fields defined in Message, 79 | // such as any semantic content on which the client wishes to achieve consensus. 80 | // On receipt of a message from another node, 81 | // the client must unmarshal it as appropriate 82 | // and invoke Node.Receive with the unmarshalled Message. 83 | // 84 | // Concurrency control 85 | // 86 | // The consensus protocol logic in this package is not thread safe: 87 | // it must be run in a single goroutine, 88 | // or else the client must implement appropriate locking. 89 | // 90 | package model 91 | -------------------------------------------------------------------------------- /go/model/model_test.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "sync" 7 | "testing" 8 | ) 9 | 10 | func (n *Node) run(maxSteps int, peer []chan *Message, wg *sync.WaitGroup) { 11 | 12 | // broadcast message for initial time step s=0 13 | n.Advance() // broadcast message for initial time step 14 | 15 | // run the required number of time steps for the test 16 | for n.m.Step < maxSteps { 17 | msg := <-peer[n.m.From] // Receive a message 18 | n.Receive(msg) // Process it 19 | } 20 | 21 | // signal that we're done 22 | wg.Done() 23 | } 24 | 25 | // Run a consensus test case with the specified parameters. 
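// (Illustrative sketch of the commitment check described in the package
// documentation, assuming msg is the Raw Message a node just broadcast
// at the start of a time-step:
//
//	if msg.QSC[0].Commit {
//		// The round ending at msg.Step committed: all nodes will have
//		// agreed on the proposal from node msg.QSC[0].Conf.From.
//	}
//
// As noted in the package documentation, other nodes might not have
// observed this same round as committed.)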
26 | func testRun(t *testing.T, thres, nnode, maxSteps, maxTicket int) { 27 | if maxTicket == 0 { // Default to moderate-entropy tickets 28 | maxTicket = 10 * nnode 29 | } 30 | desc := fmt.Sprintf("T=%v,N=%v,Steps=%v,Tickets=%v", 31 | thres, nnode, maxSteps, maxTicket) 32 | t.Run(desc, func(t *testing.T) { 33 | all := make([]*Node, nnode) 34 | peer := make([]chan *Message, nnode) 35 | send := func(dst int, msg *Message) { peer[dst] <- msg } 36 | 37 | for i := range all { // Initialize all the nodes 38 | peer[i] = make(chan *Message, 3*nnode*maxSteps) 39 | all[i] = NewNode(i, thres, nnode, send) 40 | if maxTicket > 0 { 41 | all[i].Rand = func() int64 { 42 | return rand.Int63n(int64(maxTicket)) 43 | } 44 | } 45 | } 46 | wg := &sync.WaitGroup{} 47 | for _, n := range all { // Run the nodes on separate goroutines 48 | wg.Add(1) 49 | go n.run(maxSteps, peer, wg) 50 | } 51 | wg.Wait() 52 | testResults(t, all) // Report test results 53 | }) 54 | } 55 | 56 | // Dump the consensus state of node n in round s 57 | func (n *Node) testDump(t *testing.T, s, nnode int) { 58 | r := &n.m.QSC[s] 59 | t.Errorf("%v %v conf %v %v re %v %v spoil %v %v", 60 | n.m.From, s, r.Conf.From, r.Conf.Tkt, 61 | r.Reconf.From, r.Reconf.Tkt, r.Spoil.From, r.Spoil.Tkt) 62 | } 63 | 64 | // Globally sanity-check and summarize each node's observed results. 65 | func testResults(t *testing.T, all []*Node) { 66 | for i, ni := range all { 67 | commits := 0 68 | for s, si := range ni.m.QSC { 69 | if si.Commit { 70 | commits++ 71 | for _, nj := range all { // verify consensus 72 | if nj.m.QSC[s].Conf.From != 73 | si.Conf.From { 74 | 75 | t.Errorf("%v %v UNSAFE", i, s) 76 | ni.testDump(t, s, len(all)) 77 | nj.testDump(t, s, len(all)) 78 | } 79 | } 80 | } 81 | } 82 | t.Logf("node %v committed %v of %v (%v%% success rate)", 83 | i, commits, len(ni.m.QSC), (commits*100)/len(ni.m.QSC)) 84 | } 85 | } 86 | 87 | // Run QSC consensus for a variety of test cases. 88 | func TestQSC(t *testing.T) { 89 | testRun(t, 1, 1, 100000, 0) // Trivial case: 1 of 1 consensus! 90 | testRun(t, 2, 2, 100000, 0) // Another trivial case: 2 of 2 91 | 92 | testRun(t, 2, 3, 100000, 0) // Standard f=1 case 93 | testRun(t, 3, 5, 100000, 0) // Standard f=2 case 94 | testRun(t, 4, 7, 10000, 0) // Standard f=3 case 95 | testRun(t, 5, 9, 10000, 0) // Standard f=4 case 96 | testRun(t, 11, 21, 10000, 0) // Standard f=10 case 97 | 98 | testRun(t, 3, 3, 100000, 0) // Larger-than-minimum thresholds 99 | testRun(t, 6, 7, 10000, 0) 100 | testRun(t, 9, 10, 10000, 0) 101 | 102 | // Test with low-entropy tickets: hurts commit rate, but still safe! 103 | testRun(t, 2, 3, 100000, 1) // Limit case: will never commit 104 | testRun(t, 2, 3, 100000, 2) // Extreme low-entropy: rarely commits 105 | testRun(t, 2, 3, 100000, 3) // A bit better bit still bad... 106 | } 107 | -------------------------------------------------------------------------------- /go/model/node.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | import "math/rand" 4 | 5 | // Type represents the type of a QSC message: either Raw, Ack, or Wit. 6 | // 7 | // At the start of each time step a node broadcasts a Raw message, 8 | // which proposes a block for the consensus round starting at this step 9 | // and solicits witness acknowledgments to the proposal. 
10 | // 11 | // Nodes that receive a Raw message during the same time step 12 | // reply with a unicast Ack message to Raw message's sender, 13 | // acknowledging that they have seen the sender's proposal 14 | // and merged in its QSC state. 15 | // 16 | // Once a node has received a threshold of Ack messages to its Raw proposal, 17 | // the node broadcasts a Wit message to announce that its proposal is witnessed. 18 | // Nodes wait to collect a threshold of Wit messages as their condition 19 | // to advance to the next time step and broadcast their next Raw message. 20 | // 21 | type Type int 22 | 23 | const ( 24 | // Raw unwitnessed QSC proposal 25 | Raw Type = iota 26 | // Ack is the acknowledgment of a proposal 27 | Ack 28 | // Wit is the threshold witness confirmation of a proposal 29 | Wit 30 | ) 31 | 32 | // Message contains the information nodes must pass in messages 33 | // both to run the TLC clocking protocol and achieve QSC consensus. 34 | // 35 | // This implementation of QSC performs no message marshaling or unmarshalling; 36 | // the client using it must handle message wire-format serialization. 37 | // However, the Message struct is defined so as to be compatible with 38 | // standard Go encoders such as encoding/gob or encoding/json. 39 | // The client may also marshal/unmarshal its own larger message struct 40 | // containing a superset of the information here, 41 | // such as to attach semantic content in some form to consensus proposals. 42 | type Message struct { 43 | From int // Node number of node that sent this message 44 | Step int // Logical time step this message is for 45 | Type Type // Message type: Prop, Ack, or Wit 46 | Tkt uint64 // Genetic fitness ticket for consensus 47 | QSC []Round // QSC consensus state for rounds ending at Step or later 48 | } 49 | 50 | // Node contains per-node state and configuration for TLC and QSC. 51 | // Use NewNode to create and properly initialize an instance 52 | // with the mandatory configuration parameters. 53 | // Public fields in this struct are optional configuration settings, 54 | // which NewNode initializes to defaults but the caller may change 55 | // after calling NewNode but before commencing protocol execution. 56 | // 57 | // Consensus uses the configurable Rand function to choose "genetic fitness" 58 | // lottery tickets for each node's proposal in each round. 59 | // Only the low 63 bits of the returned int64 are used. 60 | // This defaults to using the system's math/rand.Int63(). 61 | // To tolerate sophisticated network denial-of-service attackers, 62 | // a full implementation should use cryptographic randomness 63 | // and hide the tickets from the network using encryption (e.g., TLS). 64 | // 65 | // The Rand function must not be changed once the Node is in operation. 66 | // All nodes must use the same nonnegative random number distribution. 67 | // Ticket collisions are not a problem as long as they are rare, 68 | // which is why 63 bits of entropy is sufficient. 
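// (Aside: the Raw/Ack/Wit exchange described above can be paraphrased as the
// rough sketch below; the real logic lives in tlc.go and also handles
// time-step bookkeeping, so this is illustrative only. Here "broadcast" and
// "send" stand for the client-supplied send function applied to all peers
// or to a single peer, and ackMessage/witMessage are placeholder messages.)
//
//	switch msg.Type {
//	case Raw: // a peer's proposal for this step: acknowledge it
//		send(msg.From, ackMessage)
//	case Ack: // an acknowledgment of our own proposal
//		if acks++; acks >= thres {
//			broadcast(witMessage) // announce our proposal is witnessed
//		}
//	case Wit: // a peer's proposal reached the witness threshold
//		if wits++; wits >= thres {
//			advance() // move to the next step, broadcast the next Raw
//		}
//	}
//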
69 | // 70 | type Node struct { 71 | m Message // Template for messages we send 72 | 73 | thres int // TLC message and witness thresholds 74 | nnode int // Total number of nodes 75 | send func(peer int, msg *Message) // Function to send message to a peer 76 | 77 | acks int // # acknowledgments we've received in this step 78 | wits int // # threshold witnessed messages seen this step 79 | 80 | Rand func() int64 // Function to generate random genetic fitness tickets 81 | } 82 | 83 | // NewNode creates and initializes a new Node with the specified group configuration. 84 | // The parameters to NewNode are the mandatory Node configuration parameters: 85 | // self is this node's number, thres is the TLC message and witness threshold, 86 | // nnode is the total number of nodes, 87 | // and send is a function to send a Message to a given peer node number. 88 | // 89 | // Optional configuration is represented by fields in the created Node struct, 90 | // which the caller may modify before commencing the consensus protocol. 91 | // 92 | func NewNode(self, thres, nnode int, send func(peer int, msg *Message)) (n *Node) { 93 | return &Node{ 94 | m: Message{From: self, Step: -1, 95 | QSC: make([]Round, 3)}, // "rounds" ending in steps 0-2 96 | thres: thres, nnode: nnode, send: send, 97 | Rand: rand.Int63} 98 | } 99 | -------------------------------------------------------------------------------- /go/model/qsc.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | // Best is a record representing either a best confirmed proposal, 4 | // or a best potential spoiler competing with the best confirmed proposal, 5 | // used in the Round struct. 6 | // 7 | // In each case, the only information we really need is 8 | // the genetic fitness lottery ticket of the "best" proposal seen so far, 9 | // and which node produced that proposal. 10 | // This optimization works only in the non-Byzantine QSC consensus protocol, 11 | // because Byzantine consensus requires that the lottery tickets be 12 | // unknown and unbiasable to everyone until the consensus round completes. 13 | // 14 | // When we're collecting the best potential spoiler proposal - 15 | // the proposal with the highest ticket regardless of whether it's confirmed - 16 | // we must keep track of ticket collisions, 17 | // in case one colliding proposal might "win" if not spoiled by the other. 18 | // When we detect a spoiler collision, we simply set From to -1, 19 | // an invalid node number that will be unequal to, and hence properly "spoil", 20 | // a confirmed or reconfirmed proposal with the same ticket from any node. 21 | // 22 | type Best struct { 23 | From int // Node the proposal is from (spoiler: -1 for tied tickets) 24 | Tkt uint64 // Proposal's genetic fitness ticket 25 | } 26 | 27 | // Find the Best of two records primarily according to highest ticket number. 28 | // For spoilers, detect and record ticket collisions with invalid node number. 
29 | func (b *Best) merge(o *Best, spoiler bool) { 30 | if o.Tkt > b.Tkt { 31 | *b = *o // strictly better ticket 32 | } else if o.Tkt == b.Tkt && o.From != b.From && spoiler { 33 | b.From = -1 // record ticket collision 34 | } 35 | } 36 | 37 | // Round encapsulates all the QSC state needed for one consensus round: 38 | // the best potential "spoiler" proposal regardless of confirmation status, 39 | // the best confirmed (witnessed) proposal we've seen so far in the round, 40 | // and the best reconfirmed (double-witnessed) proposal we've seen so far. 41 | // Finally, at the end of the round, we set Commit to true if 42 | // the best confirmed proposal in Conf has definitely been committed. 43 | type Round struct { 44 | Spoil Best // Best potential spoiler(s) we've found so far 45 | Conf Best // Best confirmed proposal we've found so far 46 | Reconf Best // Best reconfirmed proposal we've found so far 47 | Commit bool // Whether we confirm this round successfully committed 48 | } 49 | 50 | // Merge QSC round info from an incoming message into our round history 51 | func mergeQSC(b, o []Round) { 52 | for i := range b { 53 | b[i].Spoil.merge(&o[i].Spoil, true) 54 | b[i].Conf.merge(&o[i].Conf, false) 55 | b[i].Reconf.merge(&o[i].Reconf, false) 56 | } 57 | } 58 | 59 | // The TLC layer upcalls this method on advancing to a new time-step, 60 | // with sets of proposals recently seen (saw) and threshold witnessed (wit). 61 | func (n *Node) advanceQSC() { 62 | 63 | // Choose a fresh genetic fitness ticket for this proposal 64 | n.m.Tkt = uint64(n.Rand()) | (1 << 63) // Ensure it's greater than zero 65 | 66 | // Initialize consensus state for the round starting at step. 67 | // Find best spoiler, breaking ticket ties in favor of higher node 68 | newRound := Round{Spoil: Best{From: n.m.From, Tkt: n.m.Tkt}} 69 | n.m.QSC = append(n.m.QSC, newRound) 70 | 71 | // Decide if the just-completed consensus round successfully committed. 72 | r := &n.m.QSC[n.m.Step] 73 | r.Commit = r.Conf.From == r.Reconf.From && r.Conf.From == r.Spoil.From 74 | } 75 | 76 | // TLC layer upcalls this to inform us that our proposal is threshold witnessed 77 | func (n *Node) witnessedQSC() { 78 | 79 | // Our proposal is now confirmed in the consensus round just starting 80 | // Find best confirmed proposal, breaking ties in favor of lower node 81 | myBest := &Best{From: n.m.From, Tkt: n.m.Tkt} 82 | n.m.QSC[n.m.Step+3].Conf.merge(myBest, false) 83 | 84 | // Find reconfirmed proposals for the consensus round that's in step 1 85 | n.m.QSC[n.m.Step+2].Reconf.merge(&n.m.QSC[n.m.Step+2].Conf, false) 86 | } 87 | -------------------------------------------------------------------------------- /go/model/qscod/README.md: -------------------------------------------------------------------------------- 1 | This Go package provides a minimal implementation of 2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC) 3 | for fail-stop, non-Byzantine environments. 4 | For background information on QSC and TLC, 5 | and other model implementations in several languages, please see the 6 | [top level of this repository](https://github.com/dedis/tlc/). 7 | For more details on this package see the code and its 8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/model/qscod). 
9 | -------------------------------------------------------------------------------- /go/model/qscod/core/cli.go: -------------------------------------------------------------------------------- 1 | // Package core implements the minimal core of the QSCOD consensus algorithm. 2 | // for client-driven "on-demand" consensus. 3 | // 4 | // This implementation of QSCOD builds on the TLCB and TLCR 5 | // threshold logical clock algorithms. 6 | // These algorithms are extremely simple but do impose one constraint: 7 | // the number of failing nodes must be at most one-third the group size. 8 | // 9 | // The unit tests for this package is in the test sub-package, 10 | // so that useful test framework code can be shared with other packages 11 | // without requiring any of it to be imported into development builds. 12 | // (Too bad Go doesn't allow packages to export and import test code.) 13 | // 14 | package core 15 | 16 | //import "fmt" 17 | import "sync" 18 | import "context" 19 | 20 | // Store represents an interface to one of the n key/value stores 21 | // representing the persistent state of each of the n consensus group members. 22 | // A Store's keys are integer TLC time-steps, 23 | // and its values are Value structures. 24 | // 25 | // WriteRead(step, value) attempts to write tv to the store at step v.S, 26 | // returning the first value written by any client. 27 | // WriteRead may also return a value from any higher time-step, 28 | // if other clients have moved the store's state beyond v.S. 29 | // 30 | // This interface intentionally provides no means to return an error. 31 | // If WriteRead encounters an error that might be temporary or recoverable, 32 | // then it should just keep trying (perhaps with appropriate backoff). 33 | // This is the fundamental idea of asynchronous fault tolerant consensus: 34 | // to tolerate individual storage node faults, persistently without giving up, 35 | // waiting for as long as it takes for the store to become available again. 36 | // 37 | // If the application encounters an error that warrants a true global failure, 38 | // then it should arrange for the Up function to return an error, 39 | // which will eventually cause all the worker threads to terminate. 40 | // In this case, the application can cancel any active WriteRead calls, 41 | // which may simply return the value v that was requested to be written 42 | // in order to allow the per-node worker thread to terminate cleanly. 43 | // 44 | type Store interface { 45 | WriteRead(v Value) Value 46 | } 47 | 48 | // Value represents the values that a consensus node's key/value Store maps to. 49 | type Value struct { 50 | S int64 // TLC step number this broadcast value is for 51 | P string // Application data string for this proposal 52 | I int64 // Random integer priority for this proposal 53 | R, B Set // Read set and broadcast set from TLCB 54 | } 55 | 56 | // Set represents a set of proposed values from the same time-step, 57 | // indexed by integer node numbers. 58 | type Set map[int]Value 59 | 60 | // best returns some maximum-priority Value in a Set, 61 | // together with a flag indicating whether the returned history 62 | // is uniquely the best, i.e., the set contains no history tied for best. 
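// (For illustration only: a minimal in-memory Store satisfying the WriteRead
// contract documented above. A real deployment would use a persistent,
// per-node store, e.g., those under qscod/fs; this toy simply returns the
// first value written at each step.)
//
//	type memStore struct {
//		mut  sync.Mutex
//		vals map[int64]Value
//	}
//
//	func (ms *memStore) WriteRead(v Value) Value {
//		ms.mut.Lock()
//		defer ms.mut.Unlock()
//		if ms.vals == nil {
//			ms.vals = make(map[int64]Value)
//		}
//		if first, ok := ms.vals[v.S]; ok {
//			return first // step already written: return the first value
//		}
//		ms.vals[v.S] = v
//		return v
//	}
//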
63 | func (S Set) best() (bn int, bv Value, bu bool) {
64 | 	for n, v := range S {
65 | 		if v.I >= bv.I {
66 | 			// A new best value is unique (so far)
67 | 			// if its priority is strictly higher than the last,
68 | 			// or if it has equal priority, was unique so far,
69 | 			// and is proposing identical application data.
70 | 			bn, bv, bu = n, v, v.I > bv.I || (bu && v.P == bv.P)
71 | 		}
72 | 	}
73 | 	return bn, bv, bu
74 | }
75 | 
76 | // Client represents a logical client that can propose transactions
77 | // to the consensus group and drive the QSC/TLC state machine forward
78 | // asynchronously across the key/value stores defining the group's state.
79 | //
80 | // The caller must initialize the public variables
81 | // to represent a valid QSCOD configuration,
82 | // before invoking Client.Run to run the consensus algorithm.
83 | // The public configuration variables must not be changed
84 | // after starting the client.
85 | //
86 | // KV is a slice containing interfaces to each of the key/value stores
87 | // that hold the persistent state of each node in the consensus group.
88 | // The total number of nodes N is defined to be len(KV).
89 | //
90 | // Tr and Ts are the receive and spread thresholds, respectively.
91 | // To ensure liveness against up to F slow or crashed nodes,
92 | // the receive threshold must exclude the F failed nodes: i.e., Tr <= N-F.
93 | // To ensure consistency (safety), the constraint Tr+Ts > N must hold.
94 | // Finally, to ensure that each round enjoys a high probability
95 | // of successful commitment, it should be the case that N >= 3F.
96 | // Thus, given F and N >= 3F, it is safe to set Tr = N-F and Ts = N-Tr+1.
97 | // The precise minimum threshold requirements are slightly more subtle,
98 | // but this is a safe and simpler configuration rule.
99 | //
100 | // Pr is a proposal callback function that the Client calls regularly
101 | // while running, both to update the caller's knowledge of committed proposals
102 | // and to obtain the proposal data the client next attempts to commit.
103 | // Client passes to Pr the step number and proposal Data string
104 | // of the best proposal currently known, along with a flag indicating
105 | // whether that proposal is known to have been committed.
106 | // This known proposal will change regularly across calls,
107 | // but may not change on each call and may not even be monotonic.
108 | // The Pr function returns a string representing the new preferred
109 | // proposal Data that the Client will subsequently attempt to commit,
110 | // together with a random integer priority for that proposal.
111 | //
112 | // These priorities are the non-negative random numbers providing
113 | // the symmetry-breaking priority values QSCOD requires.
114 | // In a production system, these random numbers should have high entropy
115 | // for maximum performance (minimum likelihood of collisions),
116 | // and should be generated from a cryptographically strong private source
117 | // for maximum protection against denial-of-service attacks in the network.
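// As a concrete example of the rule above: with N = 3 nodes tolerating
// F = 1 failure, it yields Tr = 2 and Ts = 2, which is the standard
// configuration this package's unit tests use (see the test sub-package).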
118 | // 119 | type Client struct { 120 | KV []Store // Per-node key/value state storage interfaces 121 | Tr, Ts int // Receive and spread threshold configuration 122 | 123 | Pr func(int64, string, bool) (string, int64) // Proposal function 124 | 125 | mut sync.Mutex // Mutex protecting this client's state 126 | } 127 | 128 | type work struct { 129 | cond *sync.Cond // For awaiting threshold conditions 130 | val Value // Value template each worker will try to write 131 | kvc Set // Key/value cache collected for this time-step 132 | max Value // Value with highest time-step we must catch up to 133 | next *work // Forward pointer to next work item 134 | } 135 | 136 | // Run starts a client running with its given configuration parameters, 137 | // proposing transactions and driving the consensus state machine continuously 138 | // forever or until the passed context is cancelled. 139 | // 140 | func (c *Client) Run(ctx context.Context) (err error) { 141 | 142 | // Keep the mutex locked whenever we're not waiting. 143 | c.mut.Lock() 144 | defer c.mut.Unlock() 145 | 146 | // Launch one client thread to drive each of the n consensus nodes. 147 | w := &work{kvc: make(Set), cond: sync.NewCond(&c.mut)} 148 | for i := range c.KV { 149 | go c.worker(i, w) 150 | } 151 | 152 | // Drive consensus state forever or until our context gets cancelled. 153 | for ; ctx.Err() == nil; w = w.next { 154 | 155 | // Wait for a threshold number of worker threads 156 | // to complete the current work-item 157 | for len(w.kvc) < c.Tr { 158 | w.cond.Wait() 159 | } 160 | 161 | //str := fmt.Sprintf("at %v kvc contains:", w.val.S) 162 | //for i, v := range w.kvc { 163 | // str += fmt.Sprintf( 164 | // "\n node %v step %v data %q pri %v R %v B %v", 165 | // i, v.S, v.P, v.I, len(v.R), len(v.B)) 166 | //} 167 | //println(str) 168 | 169 | // Set the next work-item pointer in the current work-item, 170 | // so that the worker threads know there will be a next item. 171 | w.next = &work{kvc: make(Set), cond: sync.NewCond(&c.mut)} 172 | 173 | // Wake up worker threads waiting for a next item to appear 174 | w.cond.Broadcast() 175 | 176 | // Decide on the next step number and value to broadcast, 177 | // based on the threshold set we collected, 178 | // which is now immutable and consistent across threads. 179 | // v := Value{P:Head{Step:w.max.S+1}} 180 | nv := &w.next.val 181 | // v.S = w.max.S+1 182 | nv.S = w.val.S + 1 183 | switch { 184 | 185 | case w.max.S > w.val.S: 186 | 187 | // Some node already reached a higher time-step. 188 | // Our next work item is simply to catch up all nodes 189 | // at least to the highest-known step we discovered. 190 | //println("catching up from", w.val.S, "to", w.max.S) 191 | *nv = w.max 192 | 193 | case (w.val.S & 1) == 0: // finishing even-numbered step 194 | 195 | // Complete the first TLCR broadcast 196 | // and start the second within a TLCB round. 197 | // The value for the second broadcsast is simply 198 | // the threshold receive set from the first. 199 | nv.R = w.kvc 200 | 201 | case (w.val.S & 3) == 1: 202 | 203 | // Complete the first TLCB call in a QSCOD round 204 | // and start the second TLCB call for the round. 205 | 206 | // Calculate valid potential (still tentative) 207 | // R and B sets from the first TLCB call in this round, 208 | // and include them in the second TLCB broadcast. 209 | R0, B0 := c.tlcbRB(w.kvc) 210 | 211 | // Pick any best confirmed proposal from B0 212 | // as our broadcast for the second TLCB round. 
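// (The node index and uniqueness flag returned by best are not needed
// here; any maximum-priority confirmed proposal serves equally well
// as the value to broadcast.)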
213 | _, v2, _ := B0.best() 214 | 215 | // Set the value for the second TLCB call to broadcast 216 | nv.I, nv.R, nv.B = v2.I, R0, B0 217 | 218 | case (w.val.S & 3) == 3: 219 | 220 | // Complete a prior QSCOD round and start a new one. 221 | 222 | // First, calculate valid potential R2 and B2 sets from 223 | // the second TLCB call in the completed QSCOD round. 224 | R2, B2 := c.tlcbRB(w.kvc) 225 | 226 | // We always adopt some best confirmed proposal from R2 227 | // as our own (still tentative so far) view of history. 228 | // If this round successfully commits, 229 | // then our b2 will be the same as everyone else's, 230 | // even if we fail below to realize that fact. 231 | _, b2, _ := R2.best() 232 | 233 | // Find the best-known proposal b0 in some node's R0. 234 | // We can get an R0 set from the first round in b2.R. 235 | // Also determine if b0 was uniquely best in this R0. 236 | // Our R2 and B2 sets will be subsets of any valid R0. 237 | n0, b0, u0 := b2.R.best() 238 | 239 | // See if we can determine b2 to have been committed: 240 | // if b0==b2 is the uniquely-best eligible proposal. 241 | // This test may succeed only for some nodes in a round. 242 | // If b is uniquely-best in R0 we can compare priorities 243 | // to see if two values are the same node's proposal. 244 | // // Never commit proposals that don't change the Data, 245 | // // since we use those to represent "no-op" proposals. 246 | com := u0 && b0.I == b2.I && b0.I == B2[n0].I 247 | if com { 248 | // if u0 && b0.I == b2.I && b0.I == B2[n0].I && 249 | // b0.P.Data != v.C.Data 250 | 251 | // b0.P is the original proposal with data, 252 | // which becomes the new current commit C. 253 | // The previous current commit 254 | // becomes the last commit L. 255 | //println("committed", b0.S, "data", b0.P) 256 | // v.L, v.C = v.C, b0.P 257 | } 258 | 259 | // Set the value for the first TLCB call 260 | // in the next QSCOD round to broadcast, 261 | // containing a proposal for the next round. 262 | nv.P, nv.I = c.Pr(b0.S, b0.P, com) 263 | } 264 | 265 | //fmt.Printf("at %v next step %v pri %v prop %q R %v B %v\n", 266 | // w.val.S, nv.S, nv.I, nv.P, len(nv.R), len(nv.B)) 267 | 268 | //if nv.S < w.max.S { 269 | // println("no progress: s", w.val.S, "lv", w.max.S, 270 | // "to", nv.S) 271 | //} 272 | } 273 | 274 | // Signal the worker threads to terminate with an all-nil work-item 275 | w.next = &work{} 276 | w.cond.Broadcast() 277 | 278 | // Any slow client threads will continue in the background 279 | // until they catch up with the others or successfully get cancelled. 280 | return ctx.Err() 281 | } 282 | 283 | // worker handles a goroutine dedicated to submitting WriteRead requests 284 | // to each consensus group node asynchronously without delaying the main thread. 285 | // 286 | // We could in principle launch a separate goroutine per node each time step, 287 | // which would be even simpler to manage and provide higher parallelism. 288 | // But this would risk creating a ton of outstanding concurrent goroutines 289 | // trying to access the same slow node(s) and overloading those nodes further, 290 | // or creating local resource pressures such as too many open file descriptors 291 | // in case each WriteRead call opens a new file descriptor or socket, etc. 292 | // So we have only one worker per consensus group node do everything serially, 293 | // limiting resource usage while protecting the main thread from slow nodes. 
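// Each worker's loop is simple: issue WriteRead with the current work-item's
// value template, record the Value read back in the item's kvc set until the
// receive threshold Tr is reached, then wait for the main thread to publish
// the next work-item (a work-item with a nil kvc tells the worker to stop).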
294 | // 295 | func (c *Client) worker(node int, w *work) { 296 | 297 | // Keep Client state locked while we're not waiting 298 | c.mut.Lock() 299 | 300 | // Process work-items defined by the main thread in sequence, 301 | // terminating when we encounter a work-item with a nil kvc. 302 | for ; w.kvc != nil; w = w.next { 303 | 304 | // // Pull the next Value template we're supposed to write 305 | // v := w.val 306 | 307 | // // In steps that start a new QSC round with new proposals, 308 | // // each node gets its own independent random priority 309 | // // even when they're proposals of the same application value. 310 | // if (v.S & 3) == 0 { 311 | // v.I = c.RV() 312 | // } 313 | 314 | //println(w, "before WriteRead step", w.val.S) 315 | 316 | // Try to write new value, then read whatever the winner wrote. 317 | c.mut.Unlock() 318 | v := c.KV[node].WriteRead(w.val) 319 | c.mut.Lock() 320 | 321 | //println(w, "after WriteRead step", w.val.S, "read", v.S) 322 | 323 | //if v.S < w.val.S { 324 | // println("read back value from old step", v.S, w.val.S) 325 | //} 326 | 327 | // Collect a threshold number of last-step values in w.kvc, 328 | // after which work-item w will be considered complete. 329 | // Don't modify kvc or max after reaching the threshold tr, 330 | // because they are expected to be immutable afterwards. 331 | if len(w.kvc) < c.Tr { 332 | 333 | // Record the actual value read in the work-item 334 | w.kvc[node] = v 335 | 336 | // Track the highest last-step value read on any node, 337 | // which may be higher than the one we tried to write 338 | // if we need to catch up with a faster node. 339 | if v.S > w.max.S { 340 | w.max = v 341 | } 342 | 343 | // Wake up the main thread when we reach the threshold 344 | if len(w.kvc) == c.Tr { 345 | w.cond.Broadcast() 346 | } 347 | } 348 | 349 | // Wait until the main thread has created a next work-item. 350 | for w.next == nil { 351 | w.cond.Wait() 352 | } 353 | } 354 | 355 | c.mut.Unlock() 356 | } 357 | 358 | // tlcbRB calculates the receive (R) and broadcast (B) sets 359 | // returned by the TLCB algorithm after its second TLCR call. 360 | // 361 | // The returned R and B sets are only tentative, 362 | // representing possible threshold receive-set and broadcast-set outcomes 363 | // from this TLCB invocation, computed locally by this client. 364 | // These locally-computed sets cannot be relied on to be definite for this node 365 | // until the values computed from them are committed via Store.WriteRead. 366 | // 367 | func (c *Client) tlcbRB(kvc Set) (Set, Set) { 368 | 369 | // Using the tentative client-side receive-set from the second TLCR, 370 | // compute potential receive-set (R) and broadcast-set (B) sets 371 | // to return from TLCB. 372 | R, B, Bc := make(Set), make(Set), make([]int, len(c.KV)) 373 | for _, v := range kvc { 374 | for j, vv := range v.R { 375 | R[j] = vv // R has all values we've seen 376 | Bc[j]++ // How many nodes have seen vv? 377 | if Bc[j] >= c.Ts { // B has only those reaching ts 378 | B[j] = vv 379 | } 380 | } 381 | } 382 | return R, B 383 | } 384 | -------------------------------------------------------------------------------- /go/model/qscod/core/test/cli.go: -------------------------------------------------------------------------------- 1 | // Package test contains shareable code for testing instantiations of QSCOD. 2 | package test 3 | 4 | import ( 5 | "context" 6 | "fmt" 7 | "math/rand" 8 | "sync" 9 | "testing" 10 | 11 | . 
"github.com/dedis/tlc/go/model/qscod/core" 12 | ) 13 | 14 | // Object to record the common total order and verify it for consistency 15 | type testOrder struct { 16 | hist []string // all history known to be committed so far 17 | mut sync.Mutex // mutex protecting this reference order 18 | } 19 | 20 | // When a client reports a history h has been committed, 21 | // record that in the testOrder and check it for global consistency. 22 | func (to *testOrder) committed(t *testing.T, step int64, prop string) { 23 | to.mut.Lock() 24 | defer to.mut.Unlock() 25 | 26 | // Ensure history slice is long enough 27 | for step >= int64(len(to.hist)) { 28 | to.hist = append(to.hist, "") 29 | } 30 | 31 | // Check commit consistency across all concurrent clients 32 | switch { 33 | case to.hist[step] == "": 34 | to.hist[step] = prop 35 | case to.hist[step] != prop: 36 | t.Errorf("Inconsistency at %v:\n old %q\n new %q", 37 | step, to.hist[step], prop) 38 | } 39 | } 40 | 41 | // testCli creates a test client with particular configuration parameters. 42 | func testCli(t *testing.T, self, f, maxstep, maxpri int, 43 | kv []Store, to *testOrder, wg *sync.WaitGroup) { 44 | 45 | // Create a cancelable context for the test run 46 | ctx, cancel := context.WithCancel(context.Background()) 47 | 48 | // Our proposal function simply collects and consistency-checks 49 | // committed Heads until a designated time-step is reached. 50 | pr := func(step int64, cur string, com bool) (string, int64) { 51 | //fmt.Printf("cli %v saw commit %v %q\n", self, C.Step, C.Data) 52 | 53 | // Consistency-check the history h known to be committed 54 | if com { 55 | to.committed(t, step, cur) 56 | } 57 | 58 | // Stop once we reach the step limit 59 | if step >= int64(maxstep) { 60 | cancel() 61 | } 62 | 63 | // Use the simple Int63n for random number generation, 64 | // with values constrained to be lower than maxpri for testing. 65 | // A real deployment should use cryptographic randomness 66 | // and should preferably be high-entropy, 67 | // close to the full 64 bits. 68 | pri := rand.Int63n(int64(maxpri)) 69 | 70 | return fmt.Sprintf("cli %v proposal %v", self, step), pri 71 | } 72 | 73 | // Start the test client with appropriate parameters assuming 74 | // n=3f, tr=2f, tb=f, and ts=f+1, satisfying TLCB's constraints. 75 | c := Client{KV: kv, Tr: 2 * f, Ts: f + 1, Pr: pr} 76 | c.Run(ctx) 77 | 78 | wg.Done() 79 | } 80 | 81 | // Run a consensus test case on a given set of Store interfaces 82 | // and with the specified group configuration and test parameters. 83 | func TestRun(t *testing.T, kv []Store, nfail, ncli, maxstep, maxpri int) { 84 | 85 | // Create a reference total order for safety checking 86 | to := &testOrder{} 87 | 88 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Commits=%v,Tickets=%v", 89 | nfail, len(kv), ncli, maxstep, maxpri) 90 | t.Run(desc, func(t *testing.T) { 91 | 92 | // Simulate the appropriate number of concurrent clients 93 | wg := &sync.WaitGroup{} 94 | for i := 0; i < ncli; i++ { 95 | wg.Add(1) 96 | go testCli(t, i, nfail, maxstep, maxpri, kv, to, wg) 97 | } 98 | wg.Wait() 99 | }) 100 | } 101 | -------------------------------------------------------------------------------- /go/model/qscod/core/test/cli_test.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | 7 | . 
"github.com/dedis/tlc/go/model/qscod/core" 8 | ) 9 | 10 | // Trivial intra-process key-value store implementation for testing 11 | type testStore struct { 12 | mut sync.Mutex // synchronization for testStore state 13 | v Value // the latest value written 14 | } 15 | 16 | // WriteRead implements the Store interface with a simple intra-process map. 17 | func (ts *testStore) WriteRead(v Value) Value { 18 | ts.mut.Lock() 19 | defer ts.mut.Unlock() 20 | 21 | // Write value v only if it's newer than the last value written. 22 | if v.S > ts.v.S { 23 | ts.v = v 24 | } 25 | 26 | // Then return whatever was last written, regardless. 27 | return ts.v 28 | } 29 | 30 | // Run a consensus test case with the specified parameters. 31 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) { 32 | 33 | // Create a simple test key/value store representing each node 34 | kv := make([]Store, nnode) 35 | for i := range kv { 36 | kv[i] = &testStore{} 37 | } 38 | 39 | TestRun(t, kv, nfail, ncli, maxstep, maxpri) 40 | } 41 | 42 | // Test the Client with a trivial in-memory key/value Store implementation. 43 | func TestClient(t *testing.T) { 44 | testRun(t, 1, 3, 1, 100000, 100) // Standard f=1 case 45 | testRun(t, 1, 3, 2, 100000, 100) 46 | testRun(t, 1, 3, 10, 100000, 100) 47 | testRun(t, 1, 3, 20, 100000, 100) 48 | testRun(t, 1, 3, 50, 100000, 100) 49 | testRun(t, 1, 3, 100, 100000, 100) 50 | 51 | testRun(t, 2, 6, 10, 100000, 100) // Standard f=2 case 52 | testRun(t, 3, 9, 10, 100000, 100) // Standard f=3 case 53 | testRun(t, 4, 12, 10, 100000, 100) // Standard f=4 case 54 | testRun(t, 5, 15, 10, 100000, 100) // Standard f=10 case 55 | 56 | // Test with low-entropy tickets: hurts commit rate, but still safe! 57 | testRun(t, 1, 3, 10, 100000, 2) // Extreme low-entropy: rarely commits 58 | testRun(t, 1, 3, 10, 100000, 3) // A bit better bit still bad... 59 | } 60 | -------------------------------------------------------------------------------- /go/model/qscod/encoding/enc.go: -------------------------------------------------------------------------------- 1 | // This package implements serialization of Values for QSCOD. 2 | // It currently just uses GOB encoding for simplicity, 3 | // but we should change that to something not Go-specific. 4 | package encoding 5 | 6 | import ( 7 | "bytes" 8 | "encoding/gob" 9 | 10 | . "github.com/dedis/tlc/go/model/qscod/core" 11 | ) 12 | 13 | // Encode a Value for serialized transmission. 14 | func EncodeValue(v Value) ([]byte, error) { 15 | buf := &bytes.Buffer{} 16 | enc := gob.NewEncoder(buf) 17 | if err := enc.Encode(v); err != nil { 18 | return nil, err 19 | } 20 | return buf.Bytes(), nil 21 | } 22 | 23 | // Decode a Value from its serialized format. 24 | func DecodeValue(b []byte) (v Value, err error) { 25 | r := bytes.NewReader(b) 26 | dec := gob.NewDecoder(r) 27 | err = dec.Decode(&v) 28 | return 29 | } 30 | -------------------------------------------------------------------------------- /go/model/qscod/fs/casdir/cas_test.go: -------------------------------------------------------------------------------- 1 | // Package casdir tests CAS-based QSCOD over a set of file system CAS stores. 2 | package casdir 3 | 4 | import ( 5 | "context" 6 | "fmt" 7 | "os" 8 | "testing" 9 | 10 | . "github.com/dedis/tlc/go/lib/cas" 11 | "github.com/dedis/tlc/go/lib/cas/test" 12 | "github.com/dedis/tlc/go/lib/fs/casdir" 13 | . "github.com/dedis/tlc/go/model/qscod/cas" 14 | ) 15 | 16 | // Run a consensus test case with the specified parameters. 
17 | func testRun(t *testing.T, nfail, nnode, nclients, nthreads, naccesses int) { 18 | 19 | // Create a test key/value store representing each node 20 | dirs := make([]string, nnode) 21 | for i := range dirs { 22 | dirs[i] = fmt.Sprintf("test-store-%d", i) 23 | 24 | // Remove the test directory if one is left-over 25 | // from a previous test run. 26 | os.RemoveAll(dirs[i]) 27 | 28 | // Create the test directory afresh. 29 | fs := &casdir.Store{} 30 | if err := fs.Init(dirs[i], true, true); err != nil { 31 | t.Fatal(err) 32 | } 33 | 34 | // Clean it up once the test is done. 35 | defer os.RemoveAll(dirs[i]) 36 | } 37 | 38 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Threads=%v,Accesses=%v", 39 | nfail, nnode, nclients, nthreads, naccesses) 40 | t.Run(desc, func(t *testing.T) { 41 | 42 | // Create a context and cancel it at the end of the test 43 | ctx, cancel := context.WithCancel(context.Background()) 44 | defer cancel() 45 | 46 | // Create simulated clients to access the consensus group 47 | clients := make([]Store, nclients) 48 | for i := range clients { 49 | 50 | // Create a set of Store objects for each client 51 | members := make([]Store, nnode) 52 | for j := range members { 53 | fs := &casdir.Store{} 54 | err := fs.Init(dirs[j], false, false) 55 | if err != nil { 56 | t.Fatal(err) 57 | } 58 | members[j] = fs 59 | } 60 | 61 | clients[i] = (&Group{}).Start(ctx, members, nfail) 62 | } 63 | 64 | // Run a standard torture test across all the clients 65 | test.Stores(t, nthreads, naccesses, clients...) 66 | }) 67 | } 68 | 69 | func TestConsensus(t *testing.T) { 70 | testRun(t, 1, 3, 1, 10, 10) // Standard f=1 case, 71 | testRun(t, 1, 3, 2, 10, 10) // varying number of clients 72 | testRun(t, 1, 3, 10, 10, 10) 73 | testRun(t, 1, 3, 20, 10, 10) 74 | 75 | testRun(t, 2, 6, 10, 10, 10) // Standard f=2 case 76 | testRun(t, 3, 9, 10, 10, 10) // Standard f=3 case 77 | 78 | // Note: when nnode * nclients gets to be around 120-ish, 79 | // we start running into default max-open-file limits. 80 | } 81 | -------------------------------------------------------------------------------- /go/model/qscod/fs/simple/store.go: -------------------------------------------------------------------------------- 1 | // This package provides a simple file system key/value Store for QSCOD, 2 | // with no support for garbage collection. 3 | // It is intended only for education, testing, and experimentation, 4 | // and not for any production use. 5 | // 6 | package simple 7 | 8 | import ( 9 | "context" 10 | "fmt" 11 | "io/ioutil" 12 | "os" 13 | "path/filepath" 14 | 15 | "github.com/dedis/tlc/go/lib/backoff" 16 | "github.com/dedis/tlc/go/lib/fs/atomic" 17 | . "github.com/dedis/tlc/go/model/qscod/core" 18 | "github.com/dedis/tlc/go/model/qscod/encoding" 19 | ) 20 | 21 | // FileStore implements a simple QSCOD key/value store 22 | // as a directory in a file system. 23 | // The caller must create the directory designated by Path. 24 | // 25 | type FileStore struct { 26 | Path string // Directory to contain files representing key/value state 27 | } 28 | 29 | // Attempt to write the value v to a file associated with time-step step, 30 | // then read back whichever value was successfully written first. 31 | // 32 | // This implementation simply panics if any file system error occurs. 33 | // A more robust approach suited to asynchronous consensus would be 34 | // to log the error then retry in an exponential-backoff loop. 
35 | // 36 | func (fs *FileStore) WriteRead(v Value) (rv Value) { 37 | 38 | try := func() (err error) { 39 | 40 | // Serialize the proposed value 41 | buf, err := encoding.EncodeValue(v) 42 | if err != nil { 43 | return err 44 | } 45 | 46 | // Try to write the file, ignoring already-exists errors 47 | name := fmt.Sprintf("ver-%d", v.P.Step) 48 | path := filepath.Join(fs.Path, name) 49 | err = atomic.WriteFileOnce(path, buf, 0666) 50 | if err != nil && !os.IsExist(err) { 51 | return err 52 | } 53 | 54 | // Read back whatever file was successfully written first there 55 | rbuf, err := ioutil.ReadFile(path) 56 | if err != nil { 57 | return err 58 | } 59 | 60 | // Deserialize the value read 61 | rv, err = encoding.DecodeValue(rbuf) 62 | if err != nil { 63 | return err 64 | } 65 | 66 | return nil 67 | } 68 | 69 | backoff.Retry(context.Background(), try) 70 | return rv 71 | } 72 | 73 | // QSCOD calls Committed to inform us that history comh is committed, 74 | // so we can garbage-collect entries before it in the key/value store. 75 | // But this Store does not implement garbage-collection. 76 | // 77 | func (fs *FileStore) Committed(comh Head) { 78 | // do nothing - no garbage collection 79 | } 80 | -------------------------------------------------------------------------------- /go/model/qscod/fs/simple/store_test.go: -------------------------------------------------------------------------------- 1 | package simple 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | 8 | . "github.com/dedis/tlc/go/model/qscod/core" 9 | . "github.com/dedis/tlc/go/model/qscod/core/test" 10 | ) 11 | 12 | // Run a consensus test case with the specified parameters. 13 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) { 14 | 15 | // Create a test key/value store representing each node 16 | kv := make([]Store, nnode) 17 | for i := range kv { 18 | path := fmt.Sprintf("test-store-%d", i) 19 | ss := &FileStore{path} 20 | kv[i] = ss 21 | 22 | // Remove the test directory if one is left-over 23 | // from a previous test run. 24 | os.RemoveAll(path) 25 | 26 | // Create the test directory afresh. 27 | if err := os.Mkdir(path, 0744); err != nil { 28 | t.Fatal(err) 29 | } 30 | 31 | // Clean it up once the test is done. 32 | defer os.RemoveAll(path) 33 | } 34 | 35 | TestRun(t, kv, nfail, ncli, maxstep, maxpri) 36 | } 37 | 38 | func TestSimpleStore(t *testing.T) { 39 | testRun(t, 1, 3, 1, 10, 100) // Standard f=1 case, 40 | testRun(t, 1, 3, 2, 10, 100) // varying number of clients 41 | testRun(t, 1, 3, 10, 3, 100) 42 | testRun(t, 1, 3, 20, 2, 100) 43 | testRun(t, 1, 3, 40, 2, 100) 44 | 45 | testRun(t, 2, 6, 10, 5, 100) // Standard f=2 case 46 | testRun(t, 3, 9, 10, 3, 100) // Standard f=3 case 47 | testRun(t, 4, 12, 10, 2, 100) // Standard f=4 case 48 | testRun(t, 5, 15, 10, 2, 100) // Standard f=10 case 49 | } 50 | -------------------------------------------------------------------------------- /go/model/qscod/fs/store/store.go: -------------------------------------------------------------------------------- 1 | // Package store provides a file system key/value Store for QSCOD. 2 | // It uses the cas package to implement versioned write-once and read, 3 | // with garbage collection of old versions before the last known commit. 4 | // 5 | package store 6 | 7 | import ( 8 | "context" 9 | 10 | "github.com/dedis/tlc/go/lib/backoff" 11 | "github.com/dedis/tlc/go/lib/fs/verst" 12 | . 
"github.com/dedis/tlc/go/model/qscod/core" 13 | "github.com/dedis/tlc/go/model/qscod/encoding" 14 | ) 15 | 16 | // FileStore implements a QSCOD key/value store 17 | // as a directory in a file system. 18 | // 19 | type FileStore struct { 20 | state verst.State 21 | ctx context.Context 22 | bc backoff.Config 23 | } 24 | 25 | // Initialize FileStore to use a directory at a given file system path. 26 | // If create is true, create the designated directory if it doesn't exist. 27 | // If excl is true, fail if the designated directory already exists. 28 | func (fs *FileStore) Init(ctx context.Context, path string, create, excl bool) error { 29 | 30 | fs.ctx = ctx 31 | return fs.state.Init(path, create, excl) 32 | } 33 | 34 | // SetBackoff sets the backoff configuration for handling errors that occur 35 | // while attempting to access the key/value store on the file system. 36 | // 37 | // Since we don't know in general which errors may be transitory 38 | // and which are permanent failures, especially on remote file systems, 39 | // FileStore assumes all errors may be transitory, just reports them, 40 | // and keeps trying the access after a random exponential backoff. 41 | // 42 | func (fs *FileStore) SetReport(bc backoff.Config) { 43 | fs.bc = bc 44 | } 45 | 46 | // Attempt to write the value v to a file associated with time-step step, 47 | // then read back whichever value was successfully written first. 48 | // Implements the qscod.Store interface. 49 | // 50 | func (fs *FileStore) WriteRead(v Value) (rv Value) { 51 | 52 | // Don't try to write version 0; that's a virtual placeholder. 53 | if v.P.Step == 0 { 54 | return v 55 | } 56 | 57 | try := func() (err error) { 58 | rv, err = fs.tryWriteRead(v) 59 | return err 60 | } 61 | 62 | fs.bc.Retry(fs.ctx, try) 63 | return rv 64 | } 65 | 66 | func (fs *FileStore) tryWriteRead(val Value) (Value, error) { 67 | ver := int64(val.P.Step) 68 | 69 | // Serialize the proposed value 70 | valb, err := encoding.EncodeValue(val) 71 | if err != nil { 72 | return Value{}, err 73 | } 74 | vals := string(valb) 75 | 76 | // Try to write it to the versioned store - 77 | // but don't fret if someone else wrote it or if it has expired. 78 | err = fs.state.WriteVersion(ver, vals) 79 | if err != nil && !verst.IsExist(err) && !verst.IsNotExist(err) { 80 | return Value{}, err 81 | } 82 | 83 | // Now read back whatever value was successfully written. 84 | vals, err = fs.state.ReadVersion(ver) 85 | if err != nil && verst.IsNotExist(err) { 86 | 87 | // The requested version has probably been aged out, 88 | // so catch up to the most recent committed Head. 89 | _, vals, err = fs.state.ReadLatest() 90 | } 91 | if err != nil { 92 | return Value{}, err 93 | } 94 | 95 | // Deserialize the value we read 96 | val, err = encoding.DecodeValue([]byte(vals)) 97 | if err != nil { 98 | return Value{}, err 99 | } 100 | 101 | // Expire all versions before this latest one 102 | fs.state.Expire(int64(val.P.Step)) 103 | 104 | // Return the value v that we read 105 | return val, err 106 | } 107 | -------------------------------------------------------------------------------- /go/model/qscod/fs/store/store_test.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "testing" 8 | 9 | . "github.com/dedis/tlc/go/model/qscod/core" 10 | . "github.com/dedis/tlc/go/model/qscod/core/test" 11 | ) 12 | 13 | // Run a consensus test case with the specified parameters. 
14 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) { 15 | 16 | // Create a test key/value store representing each node 17 | kv := make([]Store, nnode) 18 | ctx := context.Background() 19 | for i := range kv { 20 | path := fmt.Sprintf("test-store-%d", i) 21 | 22 | // Remove the test directory if one is left-over 23 | // from a previous test run. 24 | os.RemoveAll(path) 25 | 26 | // Create the test directory afresh. 27 | ss := &FileStore{} 28 | if err := ss.Init(ctx, path, true, true); err != nil { 29 | t.Fatal(err) 30 | } 31 | kv[i] = ss 32 | 33 | // Clean it up once the test is done. 34 | defer os.RemoveAll(path) 35 | } 36 | 37 | TestRun(t, kv, nfail, ncli, maxstep, maxpri) 38 | } 39 | 40 | func TestSimpleStore(t *testing.T) { 41 | testRun(t, 1, 3, 1, 10, 100) // Standard f=1 case, 42 | testRun(t, 1, 3, 2, 10, 100) // varying number of clients 43 | testRun(t, 1, 3, 10, 3, 100) 44 | testRun(t, 1, 3, 20, 2, 100) 45 | testRun(t, 1, 3, 40, 2, 100) 46 | 47 | testRun(t, 2, 6, 10, 5, 100) // Standard f=2 case 48 | testRun(t, 3, 9, 10, 3, 100) // Standard f=3 case 49 | 50 | // Note: when nnode * ncli gets to be around 120-ish, 51 | // we start running into default max-open-file limits. 52 | } 53 | -------------------------------------------------------------------------------- /go/model/qscod/qscas/doc.go: -------------------------------------------------------------------------------- 1 | // Package qscas provides an implementation of QSCOD consensus 2 | // that both builds on, and provides, a Check-and-Set (CAS) Store interface 3 | // as defined by the tlc/go/lib/cas package. 4 | // 5 | package qscas 6 | -------------------------------------------------------------------------------- /go/model/qscod/qscas/group.go: -------------------------------------------------------------------------------- 1 | package qscas 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | 7 | "github.com/dedis/tlc/go/lib/cas" 8 | "github.com/dedis/tlc/go/model/qscod/core" 9 | ) 10 | 11 | // Group implements the cas.Store interface as a QSCOD consensus group. 12 | // After creation, invoke Start to configure the consensus group state, 13 | // then call CompareAndSet to perform CAS operations on the logical state. 14 | type Group struct { 15 | c core.Client // consensus client core 16 | ctx context.Context // group operation context 17 | 18 | mut sync.Mutex // for synchronizing shutdown 19 | wg sync.WaitGroup // counts active CAS operations 20 | done bool // set after group shutdown 21 | 22 | // channel that CAS calls use to propose work to do 23 | ch chan func(int64, string, bool) (string, int64) 24 | } 25 | 26 | // Start initializes g to represent a consensus group comprised of 27 | // particular member nodes, starts it operating, and returns g. 28 | // 29 | // Consensus thresholds are determined by the faulty parameter, 30 | // the maximum number of faulty nodes the group should tolerate. 31 | // For this implementation of QSCOD based on the TLCB and TLCR algorithms, 32 | // faulty should be at most one-third of the total group size. 33 | // If faulty < 0, it is set to one-third of the group size, rounded down. 34 | // 35 | // Start launchers worker goroutines that help service CAS requests, 36 | // which will run and consume resources forever unless cancelled. 37 | // To define their lifetime, the caller should pass a cancelable context, 38 | // and cancel it when operations on the Group are no longer required. 
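// A typical use (mirroring this package's tests, and assuming an
// application-provided slice of cas.Store members and a fault bound f;
// old and new are the expected and desired state strings):
//
//	g := (&Group{}).Start(ctx, members, f)
//	ver, val, err := g.CompareAndSet(ctx, old, new)
//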
39 | // 40 | func (g *Group) Start(ctx context.Context, members []cas.Store, faulty int) *Group { 41 | 42 | // Calculate and sanity-check the threshold configuration parameters. 43 | // For details on where these calculations come from, see: 44 | // https://arxiv.org/abs/2003.02291 45 | N := len(members) 46 | if faulty < 0 { 47 | faulty = N / 3 // Default fault tolerance threshold 48 | } 49 | Tr := N - faulty // receive threshold 50 | Ts := N - Tr + 1 // spread threshold 51 | if Tr <= 0 || Tr > N || Ts <= 0 || Ts > Tr || (Ts+Tr) <= N { 52 | panic("faulty threshold yields unsafe configuration") 53 | } 54 | if N*(Tr-Ts+1)-Tr*(N-Tr) <= 0 { // test if Tb <= 0 55 | panic("faulty threshold yields non-live configuration") 56 | } 57 | //println("N", N, "Tr", Tr, "Ts", Ts) 58 | 59 | // Create a consensus group state instance 60 | g.c = core.Client{Tr: Tr, Ts: Ts} 61 | g.ctx = ctx 62 | g.ch = make(chan func(s int64, p string, c bool) (string, int64)) 63 | 64 | // Create a core.Store wrapper around each cas.Store group member 65 | g.c.KV = make([]core.Store, N) 66 | for i := range members { 67 | g.c.KV[i] = &coreStore{Store: members[i], g: g} 68 | } 69 | 70 | // Our proposal function normally just "punts" by waiting for 71 | // an actual proposal to get sent on the group's channel, 72 | // and then we call that to form the proposal as appropriate. 73 | // But we concurrently listen for channel cancellation 74 | // and return promptly with a no-op proposal in that case. 75 | g.c.Pr = func(s int64, p string, c bool) (prop string, pri int64) { 76 | for { 77 | select { 78 | case f := <-g.ch: // got a CAS work function to call 79 | if f == nil { // context cancelled 80 | println("Pr: channel closed") 81 | return p, 0 // no-op proposal 82 | } 83 | //println("got work function\n") 84 | prop, pri = f(s, p, c) // call work function 85 | if prop != "" || pri != 0 { 86 | return prop, pri // return its result 87 | } 88 | //println("work function yielded no work") 89 | 90 | case <-ctx.Done(): // our context got cancelled 91 | //println("Pr: cancelled") 92 | return p, 0 // produce no-op proposal 93 | } 94 | } 95 | } 96 | 97 | // Launch the underlying consensus core as a separate goroutine. 98 | // Make sure the group's WaitGroup remains nonzero until 99 | // the context is cancelled and we're ready to shut down. 100 | g.wg.Add(1) 101 | go g.run(ctx) 102 | 103 | return g 104 | } 105 | 106 | // Run consensus in a goroutine 107 | func (g *Group) run(ctx context.Context) { 108 | 109 | // Run the consensus protocol until our context gets cancelled 110 | g.c.Run(ctx) 111 | 112 | // Drain any remaining proposal function sends to the group's channel. 113 | // CompareAndSet won't add anymore after g.ctx has been cancelled. 114 | go func() { 115 | for range g.ch { 116 | } 117 | }() 118 | 119 | g.mut.Lock() 120 | 121 | // Wait until no threads are in active CompareAndSet calls. 122 | g.wg.Done() 123 | g.wg.Wait() 124 | 125 | // Now it's safe to close the group's channel. 126 | close(g.ch) 127 | g.done = true 128 | 129 | g.mut.Unlock() 130 | } 131 | 132 | // CompareAndSet conditionally writes a new version and reads the latest, 133 | // implementing the cas.Store interface. 
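// It repeatedly proposes new whenever the latest known proposal equals old,
// and completes as soon as any value is observed to commit, returning that
// value's TLC step as the version together with the committed string itself.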
134 | // 135 | func (g *Group) CompareAndSet(ctx context.Context, old, new string) ( 136 | version int64, actual string, err error) { 137 | 138 | //println("CAS lastVer", lastVer, "reqVal", reqVal) 139 | 140 | // Record active CompareAndSet calls in a WaitGroup 141 | // so that the group's main goroutine can wait for them to complete 142 | // when shutting down gracefully in response to context cancellation. 143 | // Atomically check that the group is still active before wg.Add. 144 | g.mut.Lock() 145 | if g.done { 146 | //println("CAS after done") 147 | // This should only ever happen once the context is cancelled 148 | if g.ctx.Err() == nil { 149 | panic("group done but context not cancelled?") 150 | } 151 | g.mut.Unlock() 152 | return 0, "", g.ctx.Err() 153 | } 154 | g.wg.Add(1) 155 | g.mut.Unlock() 156 | defer g.wg.Done() 157 | 158 | // We'll need a mutex to protect concurrent accesses to our locals. 159 | mut := sync.Mutex{} 160 | 161 | // Define the proposal formulation function that will do our work. 162 | // Returns the empty string to keep this worker thread waiting 163 | // for something to propose while letting other threads progress. 164 | pr := func(s int64, cur string, com bool) (prop string, pri int64) { 165 | mut.Lock() 166 | defer mut.Unlock() 167 | 168 | //println("CAS step", s, cur, com, "prop", old, "->", new) 169 | 170 | // Now check the situation of what's known to be committed. 171 | switch { 172 | 173 | // It's safe to propose new as the new string to commit 174 | // if the prior value we're building on is equal to old. 175 | case cur == old: 176 | prop, pri = new, randValue() 177 | 178 | // Complete the CAS operation as soon as we commit anything, 179 | // whether it was our new proposal or some other string. 180 | case com: 181 | version, actual = int64(s), cur 182 | 183 | // Otherwise, if the current proposal isn't the same as old 184 | // but also isn't committed, we have to make no-op proposals 185 | // until we manage to get something committed. 186 | default: 187 | println("no-op proposal") 188 | prop, pri = cur, randValue() 189 | 190 | //case int64(s) > lastVer && c && p != prop: 191 | // err = cas.Changed 192 | // fallthrough 193 | //// XXX get rid of Changed? 194 | 195 | //case int64(s) > lastVer && c: 196 | // actualVer = int64(s) 197 | // actualVal = reqVal 198 | 199 | //case int64(s) > lastVer: 200 | // // do nothing 201 | 202 | // Our CAS has succeeded if we've committed a new version 203 | // that builds immediately on the version we were expecting 204 | // and that commits the reqVal we were trying to propose. 205 | // Return "" in prop to have this worker thread keep waiting 206 | // for a future CAS operation to propose something useful. 207 | // case int64(L.Step) == lastVer && C.Data == reqVal: 208 | // println("proposal committed at step", C.Step) 209 | // if int64(C.Step) <= lastVer { 210 | // panic("XXX") 211 | // } 212 | // actualVer = int64(s) 213 | // actualVal = reqVal 214 | 215 | // Otherwise, our CAS fails with a Changed error as soon as 216 | // anything else gets committed on top of lastVer. 217 | // Return "" in prop to keep this worker thread waiting. 218 | // case int64(C.Step) > lastVer: 219 | // println("proposal overridden at step", C.Step) 220 | // actualVer = int64(C.Step) 221 | // actualVal = C.Data 222 | // err = cas.Changed 223 | 224 | // If C.Step < lastVer, we're choosing a proposal for a node 225 | // that doesn't yet "know" that lastVer was committed. 
226 | // Just return a "safe" no-op proposal for this node, 227 | // although we know it has no chance of being committed. 228 | // case int64(C.Step) < lastVer: 229 | // println(i, "outdated at", C.Step, "<", lastVer, 230 | // "data", C.Data) 231 | // prop, pri = C.Data, 0 232 | 233 | //default: 234 | // panic("lastVer appears to be from the future") 235 | } 236 | return 237 | } 238 | 239 | // A simple helper function to test if we've completed our work. 240 | done := func() bool { 241 | mut.Lock() 242 | defer mut.Unlock() 243 | return actual != "" || err != nil 244 | } 245 | 246 | // Continuously send references to our proposal function 247 | // to the group's channel so it will get called until it finishes 248 | // or until one of the contexts (ours or the group's) is cancelled. 249 | // Since the channel is unbuffered, each send will block 250 | // until some consensus worker thread is ready to receive it. 251 | for !done() && ctx.Err() == nil && g.ctx.Err() == nil { 252 | //println("CAS sending", old, "->", new) 253 | g.ch <- pr 254 | } 255 | // println("CAS done", lastVer, "reqVal", reqVal, 256 | // "actualVer", actualVer, "actualVal", actualVal, "err", err) 257 | return version, actual, err 258 | } 259 | -------------------------------------------------------------------------------- /go/model/qscod/qscas/group_test.go: -------------------------------------------------------------------------------- 1 | package qscas 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "testing" 7 | 8 | "github.com/dedis/tlc/go/lib/cas" 9 | "github.com/dedis/tlc/go/lib/cas/test" 10 | ) 11 | 12 | // Run a consensus test case with the specified parameters. 13 | func testRun(t *testing.T, nfail, nnode, nclients, nthreads, naccesses int) { 14 | 15 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Threads=%v,Accesses=%v", 16 | nfail, nnode, nclients, nthreads, naccesses) 17 | t.Run(desc, func(t *testing.T) { 18 | 19 | // Create a cancelable context for the test run 20 | ctx, cancel := context.WithCancel(context.Background()) 21 | 22 | // Create an in-memory CAS register representing each node 23 | members := make([]cas.Store, nnode) 24 | memhist := make([]test.History, nnode) 25 | for i := range members { 26 | members[i] = &cas.Register{} 27 | } 28 | 29 | // Create a consensus group Store for each simulated client 30 | clients := make([]cas.Store, nclients) 31 | for i := range clients { 32 | 33 | // Interpose checking wrappers on the CAS registers 34 | checkers := make([]cas.Store, nnode) 35 | for i := range checkers { 36 | checkers[i] = test.Checked(t, &memhist[i], 37 | members[i]) 38 | } 39 | 40 | clients[i] = (&Group{}).Start(ctx, checkers, nfail) 41 | } 42 | 43 | // Run a standard torture-test across all the clients 44 | test.Stores(t, nthreads, naccesses, clients...) 45 | 46 | // Shut down all the clients by canceling the context 47 | cancel() 48 | }) 49 | } 50 | 51 | // Test the Client with a trivial in-memory key/value Store implementation. 52 | func TestClient(t *testing.T) { 53 | testRun(t, 1, 3, 1, 1, 1000) // Standard f=1 case 54 | testRun(t, 1, 3, 2, 1, 1000) 55 | testRun(t, 1, 3, 10, 1, 1000) 56 | testRun(t, 1, 3, 20, 1, 100) 57 | testRun(t, 1, 3, 50, 1, 10) 58 | testRun(t, 1, 3, 100, 1, 10) 59 | 60 | testRun(t, 2, 6, 10, 10, 000) // Standard f=2 case 61 | testRun(t, 3, 9, 10, 10, 100) // Standard f=3 case 62 | testRun(t, 4, 12, 10, 10, 100) // Standard f=4 case 63 | testRun(t, 5, 15, 10, 10, 100) // Standard f=10 case 64 | 65 | // Test with low-entropy tickets: hurts commit rate, but still safe! 
66 | testRun(t, 1, 3, 10, 10, 1000) // Extreme low-entropy: rarely commits 67 | testRun(t, 1, 3, 10, 10, 1000) // A bit better bit still bad... 68 | } 69 | -------------------------------------------------------------------------------- /go/model/qscod/qscas/rand.go: -------------------------------------------------------------------------------- 1 | package qscas 2 | 3 | import ( 4 | "crypto/rand" 5 | "encoding/binary" 6 | ) 7 | 8 | // Generate a 63-bit positive integer from strong cryptographic randomness. 9 | func randValue() int64 { 10 | var b [8]byte 11 | _, err := rand.Read(b[:]) 12 | if err != nil { 13 | panic("error reading cryptographic randomness: " + err.Error()) 14 | } 15 | return int64(binary.BigEndian.Uint64(b[:]) &^ (1 << 63)) 16 | } 17 | -------------------------------------------------------------------------------- /go/model/qscod/qscas/store.go: -------------------------------------------------------------------------------- 1 | package qscas 2 | 3 | import ( 4 | "github.com/dedis/tlc/go/lib/backoff" 5 | "github.com/dedis/tlc/go/lib/cas" 6 | "github.com/dedis/tlc/go/model/qscod/core" 7 | "github.com/dedis/tlc/go/model/qscod/encoding" 8 | ) 9 | 10 | // coreStore implements QSCOD core's native Store interface 11 | // based on a cas.Store interface. 12 | type coreStore struct { 13 | cas.Store // underlying CAS state store 14 | g *Group // group this store is associated with 15 | lvals string // last value we observed in the underlying Store 16 | lval core.Value // deserialized last value 17 | } 18 | 19 | func (cs *coreStore) WriteRead(v core.Value) (rv core.Value) { 20 | 21 | try := func() (err error) { 22 | rv, err = cs.tryWriteRead(v) 23 | return err 24 | } 25 | 26 | // Try to perform the atomic operation until it succeeds 27 | // or until the group's context gets cancelled. 28 | err := backoff.Retry(cs.g.ctx, try) 29 | if err != nil && cs.g.ctx.Err() != nil { 30 | 31 | // The group's context got cancelled, 32 | // so just silently return nil Values 33 | // until the consensus worker threads catch up and terminate. 
34 | //println("WriteRead cancelled") 35 | return core.Value{} 36 | } 37 | if err != nil { 38 | panic("backoff.Retry inexplicably gave up: " + err.Error()) 39 | } 40 | return rv 41 | } 42 | 43 | func (cs *coreStore) tryWriteRead(val core.Value) (core.Value, error) { 44 | 45 | // Serialize the proposed value 46 | valb, err := encoding.EncodeValue(val) 47 | if err != nil { 48 | println("encoding error", err.Error()) 49 | return core.Value{}, err 50 | } 51 | vals := string(valb) 52 | 53 | // Try to set the underlying CAS register to the proposed value 54 | // only as long as doing so would strictly increase its TLC step 55 | for val.S > cs.lval.S { 56 | 57 | // Write the serialized value to the underlying CAS interface 58 | _, avals, err := cs.CompareAndSet(cs.g.ctx, cs.lvals, vals) 59 | if err != nil { 60 | println("CompareAndSet error", err.Error()) 61 | return core.Value{}, err 62 | } 63 | 64 | // Deserialize the actual value we read back 65 | aval, err := encoding.DecodeValue([]byte(avals)) 66 | if err != nil { 67 | println("decoding error", err.Error()) 68 | return core.Value{}, err 69 | } 70 | 71 | // println("tryWriteRead step", 72 | // cs.lval.S, "w", val.S, "->", aval.S, 73 | // "casver", cs.lver, "->", aver) 74 | 75 | if aval.S <= cs.lval.S { 76 | panic("CAS failed to advance TLC step!") 77 | } 78 | 79 | // Update our record of the underlying CAS version and value 80 | //println("update from step", cs.lval.S, "to step", aval.S) 81 | cs.lvals, cs.lval = avals, aval 82 | } 83 | 84 | //println("cs returning newer step", cs.lval.S) 85 | return cs.lval, nil 86 | } 87 | -------------------------------------------------------------------------------- /go/model/quepaxa/consensus.go: -------------------------------------------------------------------------------- 1 | package quepaxa 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | ) 7 | 8 | type Node int32 9 | type Choice int64 10 | type Step int32 11 | 12 | // A logical time consists of 13 | // a Choice (consensus decision or slot number) and 14 | // a Step (consensus attempt number within a turn). 15 | type Time struct { 16 | c Choice 17 | s Step 18 | } 19 | 20 | // Returns true if logical time T1 is strictly less than T2. 
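// Times are ordered lexicographically, first by choice number and then by
// step, so for example (c=2, s=7) precedes (c=3, s=0).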
21 | func (t1 Time) LT(t2 Time) bool { 22 | return t1.c < t2.c || (t1.c == t2.c && t1.s < t2.s) 23 | } 24 | 25 | type Replica[P Proposal[P]] interface { 26 | Record(ctx context.Context, t Time, p P) ( 27 | rt Time, rf P, rl P, err error) 28 | } 29 | 30 | type Proposer[P Proposal[P]] struct { 31 | 32 | // configuration state 33 | w []worker[P] // one worker per replica 34 | th int // consensus threshold (n-f) 35 | 36 | // synchronization state 37 | m sync.Mutex 38 | c sync.Cond 39 | 40 | t Time // proposer's current logical time 41 | 42 | // per-choice state 43 | ld Node // which replica is the leader, -1 if none 44 | dp P // decision proposal from last choice 45 | nf int // number of fast-path responses this choice 46 | 47 | // per-step state 48 | pp P // preferred proposal for this step 49 | bp P // best of appropriate replies this step 50 | nr int // number of responses seen so far this step 51 | 52 | // graceful termination state 53 | stop bool // signal when workers should shut down 54 | ctx context.Context // cancelable context for all of our workers 55 | cancel context.CancelFunc // cancellation function 56 | } 57 | 58 | func (p *Proposer[P]) Init(replicas []Replica[P]) { 59 | 60 | if p.w != nil { 61 | panic("Proposer.Init must not be invoked twice") 62 | } 63 | 64 | // set up a cancelable context for when we want to stop 65 | p.ctx, p.cancel = context.WithCancel(context.Background()) 66 | 67 | // set the threshold appropriately for group size 68 | p.th = len(replicas)/2 + 1 69 | 70 | p.w = make([]worker[P], len(replicas)) 71 | for i := range replicas { 72 | p.w[i].p = p 73 | p.w[i].r = replicas[i] 74 | p.w[i].i = Node(i) 75 | 76 | go p.w[i].work() 77 | } 78 | } 79 | 80 | func (p *Proposer[P]) Agree(preferred P) (choice Choice, decision P) { 81 | 82 | // keep our mutex locked except while waiting on a condition 83 | p.m.Lock() 84 | defer p.m.Unlock() 85 | 86 | c := p.t.c 87 | if p.t.s < 4 { 88 | p.advance(Time{p.t.c, 4}, preferred) 89 | } 90 | for !p.stop && p.t.c == c { 91 | // signal any non-busy workers that there's new work to do 92 | p.c.Broadcast() 93 | } 94 | 95 | // return choice at which last decision was made, and that decision 96 | return p.t.c - 1, p.dp 97 | } 98 | 99 | // Advance to time t with preferred proposap pp. 100 | // Proposer's mutex must be locked. 101 | func (p *Proposer[P]) advance(t Time, pp P) { 102 | p.t = t // new time step 103 | p.pp = pp // preferred proposal entering new step 104 | p.bp = pp.Nil() // initial best proposal from new step 105 | p.nr = 0 // count responses toward threshold 106 | 107 | if t.s == 4 { // only when advancing to fast-path step... 108 | p.nf = 0 // initialize fast-path response count 109 | } 110 | } 111 | 112 | // Each worker thread calls workDone when it gets a response from a recorder. 113 | // 114 | // This function gets called at most once per recorder per time step, 115 | // so it can count responses without worrying about duplicates. 116 | func (p *Proposer[P]) workDone(rt Time, rf, rl P) { 117 | 118 | // When we receive fast-path responses from phase 4 of current choice, 119 | // count them towards the fast-path threshold even if they come late. 120 | if rt.c == p.t.c && rt.s == 4 { 121 | p.nf++ 122 | if p.nf == p.th { 123 | p.decided(rf) // fast-path decision 124 | } 125 | } 126 | 127 | // where is the proposer with respect to the response in logical time? 128 | if rt.LT(p.t) { // is the response behind the proposer? 
129 | return // the work done is obsolete - just discard 130 | } 131 | if p.t.LT(rt) { // is the response ahead of the proposer? 132 | p.advance(rt, rf) // advance to newer time in response 133 | return 134 | } 135 | // the response is from proposer's current time step exactly 136 | 137 | // what we do with the response depends on which phase we're in 138 | if rt.s&3 == 0 { 139 | p.bp = p.bp.Best(rf) // Phase 0: best of first proposals 140 | } else if rt.s&2 != 0 { 141 | p.bp = p.bp.Best(rl) // Phase 2-3: best of last aggregate 142 | } 143 | 144 | // have we reached the response threshold for this step? 145 | p.nr++ 146 | if p.nr < p.th { 147 | return // not yet, wait for more responses 148 | } 149 | // threshold reached, so we can complete this time step 150 | 151 | // in phase 2, check if we've reached a consensus decision 152 | if rt.s&3 == 2 && p.pp.EqD(p.bp) { 153 | p.decided(p.pp) 154 | return 155 | } 156 | // no decision yet but still end of current time step 157 | 158 | // in phases 0 and 3, new preferred proposal is best from replies 159 | pp := p.pp 160 | if rt.s&3 == 0 || rt.s&3 == 3 { 161 | pp = p.bp 162 | } 163 | 164 | // advance to next logical time step 165 | p.advance(Time{p.t.c, p.t.s + 1}, pp) 166 | } 167 | 168 | func (p *Proposer[P]) decided(dp P) { 169 | 170 | // record the decision in local state 171 | p.t.c++ // last choice is decided, now on to next 172 | p.t.s = 0 // idle but ready for a new agreement 173 | p.dp = dp // record decision proposal from last choice 174 | p.ld = -1 // default to no leader, but caller can change 175 | 176 | // signal the main proposer thread to return the decision, 177 | // while the workers inform the recorders asynchronously. 178 | p.c.Broadcast() 179 | } 180 | 181 | // Immediately after observing a decision being made, 182 | // the application can select a new leader based on that decision. 183 | // If SetLeader is not called, the next choice is leaderless. 184 | // The choice of leader (or lack thereof) must be deterministic 185 | // based on prior decisions and set the same on all nodes. 186 | func (p *Proposer[P]) SetLeader(leader Node) { 187 | p.ld = leader 188 | } 189 | 190 | // Stop permanently shuts down this proposer and its worker threads. 191 | func (p *Proposer[P]) Stop() { 192 | 193 | p.stop = true // signal that workers should stop 194 | p.c.Broadcast() // wake them up to see the signal 195 | p.cancel() // also cancel all Record calls in progress 196 | } 197 | 198 | // We create one worker per replica. 199 | type worker[P Proposal[P]] struct { 200 | p *Proposer[P] // back pointer to Proposer 201 | r Replica[P] // Replica interface of this replica 202 | i Node // replica number of this replica 203 | } 204 | 205 | func (w *worker[P]) work() { 206 | p := w.p // keep handy pointer back to proposer 207 | p.m.Lock() 208 | for !p.stop { 209 | // we're done with prior steps so wait until proposer advances 210 | t := p.t // save proposer's current time 211 | for p.t == t { 212 | p.c.Wait() 213 | } 214 | 215 | pp := p.pp // save proposer's preferred proposal 216 | if t.s&3 == 0 { // in phase zero we must re-rank proposals 217 | pp = pp.Rank(w.i, t.s == 4 && w.i == p.ld) 218 | } 219 | 220 | // asychronously record the proposal with mutex unlocked 221 | p.m.Unlock() 222 | rt, rf, rl, err := w.r.Record(p.ctx, t, pp) 223 | if err != nil { // canceled 224 | return 225 | // XXX backoff retry? 
226 | } 227 | p.m.Lock() 228 | 229 | // inform the Proposer that this recorder's work is done 230 | p.workDone(rt, rf, rl) 231 | } 232 | p.m.Unlock() 233 | } 234 | -------------------------------------------------------------------------------- /go/model/quepaxa/isr.go: -------------------------------------------------------------------------------- 1 | package quepaxa 2 | 3 | // Interval Summary Register (ISR) 4 | type ISR[P Proposal[P]] struct { 5 | t Time // current logical time step 6 | f P // first value seen in this step 7 | a P // aggregated values so far in this step 8 | l P // aggregated values seen in last step 9 | } 10 | 11 | func (r *ISR[P]) Record(t Time, p P) (Time, P, P) { 12 | 13 | if r.t.LT(t) { 14 | // Our recorder state needs to catch up to time t 15 | if t.s == r.t.s+1 { 16 | r.l = r.a 17 | } else { 18 | r.l = r.l.Nil() 19 | } 20 | r.t = t 21 | r.f = p 22 | r.a = p 23 | 24 | } else if !t.LT(r.t) { 25 | 26 | // At exactly the right time step - just aggregate proposals 27 | r.a = r.a.Best(p) 28 | 29 | } else { 30 | // proposal p is obsolete - just discard it 31 | } 32 | 33 | // In any case, return the latest recorder state 34 | return r.t, r.f, r.l 35 | } 36 | -------------------------------------------------------------------------------- /go/model/quepaxa/proposal.go: -------------------------------------------------------------------------------- 1 | package quepaxa 2 | 3 | import ( 4 | "crypto/rand" 5 | "encoding/binary" 6 | "math" 7 | ) 8 | 9 | // The Proposal interface defines constraints for a concrete proposal type P. 10 | // 11 | // The Rank method must return the same proposal with rank set appropriately: 12 | // - to the maximum rank High if leader is set (this replica is the leader) 13 | // - to a freshly-chosen random rank between 1 and High-1 otherwise 14 | // 15 | // In addition, if proposal ranks are low-entropy so there is a chance of ties, 16 | // and P is using replica numbers for tiebreaking, 17 | // then the Rank function also sets the replica number in the proposal. 18 | type Proposal[P any] interface { 19 | Nil() P // the nil proposal 20 | Best(other P) P // best of this and other 21 | Rank(replica Node, leader bool) P // randomly rank proposal 22 | EqD(other P) bool // equality for deciding 23 | } 24 | 25 | // BasicProposal provides a basic proposal design 26 | // that represents a reasonable "sweet spot" for most purposes. 27 | // 28 | // Proposals are randomly ranked using 31 bits of private randomness, 29 | // drawn from the cryptographic random source for strong unpredictability, 30 | // which might conceivably be needed to protect against a strong DoS attacker. 31 | // Since 31-bit random ranks do not have high entropy, 32 | // BasicProposal uses recorder numbers for breaking ties. 33 | // 34 | // BasicProposal contains a field D of parameterized type Data, 35 | // containing any application-defined data associated with the proposal. 36 | // This type may contain pointers or slices (e.g., referring to bulk data) 37 | // provided the referenced data objects do not change during consensus. 38 | // The BasicProposal does nothing with this data field other than copy it. 
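// Best orders proposals by rank R, breaking ties by replica number N,
// so two proposals compare as equal only when both fields match,
// which is exactly the condition the EqD method tests.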
39 | type BasicProposal[Data any] struct { 40 | R uint32 // Randomzed rank or priority 41 | N Node // Replica for which proposal was created 42 | D Data // Application-defined data 43 | } 44 | 45 | const basicProposalHighRank = math.MaxUint32 46 | 47 | func (_ BasicProposal[D]) Nil() BasicProposal[D] { 48 | return BasicProposal[D]{} 49 | } 50 | 51 | func (p BasicProposal[D]) Best(o BasicProposal[D]) BasicProposal[D] { 52 | if o.R > p.R || (o.R == p.R && o.N > p.N) { 53 | return o 54 | } 55 | return p 56 | } 57 | 58 | func (p BasicProposal[D]) Rank(node Node, leader bool) BasicProposal[D] { 59 | 60 | // record the replica number that this proposal was produced for 61 | p.N = node 62 | 63 | if leader { 64 | // the leader always uses the reserved maximum rank 65 | p.R = basicProposalHighRank 66 | 67 | } else { 68 | // read 32 bits of randomness 69 | var b [4]byte 70 | _, err := rand.Read(b[:]) 71 | if err != nil { 72 | panic("unable to read cryptographically random bits: " + 73 | err.Error()) 74 | } 75 | 76 | // produce a 31-bit rank, avoiding the zero rank 77 | p.R = (binary.BigEndian.Uint32(b[:]) & 0x7fffffff) + 1 78 | } 79 | return p 80 | } 81 | 82 | func (p BasicProposal[D]) EqD(o BasicProposal[D]) bool { 83 | return p.R == o.R && p.N == o.N 84 | } 85 | 86 | var bp BasicProposal[struct{}] 87 | var prop Proposer[BasicProposal[struct{}]] 88 | -------------------------------------------------------------------------------- /go/model/tlc.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | // Create a copy of our message template for transmission. 4 | // Sends QSC state only for the rounds still in our window. 5 | func (n *Node) newMsg() *Message { 6 | msg := n.m // copy template 7 | msg.QSC = append([]Round{}, n.m.QSC[n.m.Step:]...) // active QSC state 8 | return &msg 9 | } 10 | 11 | // Broadcast a copy of our current message template to all nodes 12 | func (n *Node) broadcastTLC() { 13 | msg := n.newMsg() 14 | for i := 0; i < n.nnode; i++ { 15 | n.send(i, msg) 16 | } 17 | } 18 | 19 | // Advance to the next TLC time step. 20 | // 21 | // The client must invoke this function once after calling NewNode 22 | // to launch the protocol and broadcast the message for TLC time-step zero. 23 | // Thereafter, TLC advances time automatically based on network communication. 24 | // 25 | func (n *Node) Advance() { 26 | 27 | // Initialize message template with a proposal for the new time step 28 | n.m.Step++ // Advance to next time step 29 | n.m.Type = Raw // Broadcast raw proposal first 30 | n.acks = 0 // No acknowledgments received yet in this step 31 | n.wits = 0 // No threshold witnessed messages received yet 32 | 33 | // Notify the upper (QSC) layer of the advancement of time, 34 | // and let it fill in its part of the new message to broadcast. 35 | n.advanceQSC() 36 | 37 | n.broadcastTLC() // broadcast our raw proposal 38 | } 39 | 40 | // Receive is called by the client or network layer on receipt of a Message 41 | // from a peer. 42 | // Any unmarshaling that may be required must have already been done. 43 | // 44 | // This function assumes that peer-to-peer connections are ordered and reliable, 45 | // as they are when sent over Go channels or TCP/TLS connections. 46 | // It also assumes that connection or peer failures are permanent: 47 | // this implementation of QSC does not support restarting/resuming connections. 48 | // 49 | func (n *Node) Receive(msg *Message) { 50 | 51 | // Process only messages from the current or next time step. 
52 | // We could accept and merge in information from older messages, 53 | // but it's perfectly safe and simpler just to ignore old messages. 54 | if msg.Step >= n.m.Step { 55 | 56 | // If msg is ahead of us, then virally catch up to it 57 | // Since we receive messages from a given peer in order, 58 | // a message we receive can be at most one step ahead of ours. 59 | if msg.Step > n.m.Step { 60 | n.Advance() 61 | } 62 | 63 | // Merge in received QSC state for rounds still in our pipeline 64 | mergeQSC(n.m.QSC[msg.Step:], msg.QSC) 65 | 66 | // Now process this message according to type. 67 | switch msg.Type { 68 | case Raw: // Acknowledge unwitnessed proposals. 69 | ack := n.newMsg() 70 | ack.Type = Ack 71 | n.send(msg.From, ack) 72 | 73 | case Ack: // Collect a threshold of acknowledgments. 74 | n.acks++ 75 | if n.m.Type == Raw && n.acks >= n.thres { 76 | n.m.Type = Wit // Prop now threshold witnessed 77 | n.witnessedQSC() 78 | n.broadcastTLC() 79 | } 80 | 81 | case Wit: // Collect a threshold of threshold witnessed messages 82 | n.wits++ // witnessed messages in this step 83 | if n.wits >= n.thres { 84 | n.Advance() // tick the clock 85 | } 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /spin/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a minimal model of 2 | Que Sera Consensus (QSC) and Threshold Logical Clocks (TLC) 3 | for the [Spin model checker](https://spinroot.com/spin/whatispin.html). 4 | To test it, simply use the provided `run.sh` script after installing Spin. 5 | 6 | For background information on QSC and TLC, 7 | and other model implementations in several languages, please see the 8 | [top level of this repository](https://github.com/dedis/tlc/). 9 | -------------------------------------------------------------------------------- /spin/qp.pml: -------------------------------------------------------------------------------- 1 | // Simple model of QuePaxa consensus. 2 | // Recorder logic runs atomically in-line within the proposer code. 3 | 4 | #define N 3 // total number of recorder (state) nodes 5 | #define F 1 // number of failures tolerated 6 | #define T (N-F) // consensus threshold required 7 | 8 | #define M 2 // number of proposers (clients) 9 | 10 | #define STEPHI 11 // highest step number to simulate 11 | #define RAND 2 // random part of fitness space is 1..RAND 12 | #define HI (RAND+1) // top priority for proposals by leader 13 | #define VALS 2 // space of preferred values is 1..VALS 14 | 15 | // A proposal is an integer divided into two bit-fields: fitness and value. 
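The macro definitions that follow implement this packing, although their shift operators appear garbled in this rendering of the file. For reference, here is the same encoding reconstructed in Go (my reading of the macros, not code from the repository):

```go
// Reconstruction of the Promela proposal encoding: a 4-bit fitness field
// packed above a 4-bit value field in a single integer.
const (
	valBits  = 4
	fitBits  = 4
	valShift = 0
	fitShift = valBits
)

func packProp(fit, val uint) uint { return fit<<fitShift | val<<valShift }
func propVal(p uint) uint         { return (p >> valShift) & (1<<valBits - 1) }
func propFit(p uint) uint         { return (p >> fitShift) & (1<<fitBits - 1) }
```

With four bits each, fitness and value must stay below 16, which the asserts in init check against HI and VALS.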
16 | #define VALBITS 4 17 | #define FITBITS 4 18 | #define VALSHIFT (0) 19 | #define FITSHIFT (VALBITS) 20 | #define PROP(f,v) (((f)<> VALSHIFT) & ((1 << VALBITS)-1)) 22 | #define FIT(p) (((p) >> FITSHIFT) & ((1 << FITBITS)-1)) 23 | 24 | #define MAX(a, b) ((a) > (b) -> (a) : (b)) 25 | 26 | // Recorder state: implements an interval summary register (ISR), 27 | // which returns the first value submitted in this time step 28 | // and the maximum of all values submitted in the prior time step 29 | typedef Rec { 30 | byte s; // step number 31 | byte f; // first value submitted in this step 32 | byte a; // maximum value seen so far in this step 33 | byte m; // maximum value seen in prior step (s-1) 34 | } 35 | 36 | Rec rec[1+N]; // state of recorder nodes 1..N 37 | byte decided; // proposed value that we've decided on 38 | byte leader; // which proposer is the well-known leader 39 | 40 | #define DECIDE(j, s, p) atomic { \ 41 | printf("%d step %d decided <%d,%d>", j, s, FIT(p), VAL(p)); \ 42 | assert(decided == 0 || decided == VAL(p)); \ 43 | decided = VAL(p); \ 44 | } 45 | 46 | 47 | // We model one process per proposer. 48 | proctype Proposer(byte j) { // We're proposer j in 1..M 49 | byte s, t; 50 | byte p, g; 51 | byte i, recs, mask; // recorders we've interacted with 52 | bit done; // for detecting early-decision opportunities 53 | 54 | // Choose the arbitrary initial "preferred value" of this proposer 55 | s = 4; 56 | select (t : 1 .. VALS); // select a "random" value into temporary 57 | p = PROP(HI, t); 58 | printf("%d proposing %d\n", j, t); 59 | 60 | do // iterate over time-steps 61 | :: s <= STEPHI -> 62 | printf("%d step %d\n", j, s); 63 | 64 | // Send and get reply from threshold of recorders 65 | recs = 0; // number of recorders we've heard from 66 | mask = 0; // bit mask of those recorders 67 | g = 0; // gather best response proposer saw so far 68 | done = true; 69 | select (i : 1 .. N); // first recorder to interact with 70 | do // interact with the recorders in any order 71 | :: recs < T && (mask & (1 << i)) == 0 -> 72 | 73 | atomic { 74 | // Randomize fitnesses if we're not the leader 75 | if 76 | :: (s & 3) == 0 && j != leader -> 77 | select(t : 1 .. RAND); 78 | p = PROP(t, VAL(p)); 79 | :: else -> skip 80 | fi 81 | assert(FIT(p) > 0 && VAL(p) > 0); 82 | 83 | // enter the recorder/ISR role (via "RPC"). 84 | printf("%d step %d ISR <%d,%d> to %d\n", 85 | j, s, FIT(p), VAL(p), i); 86 | 87 | // first catch up the recorder if appropriate 88 | if 89 | :: s > rec[i].s -> 90 | rec[i].m = ((s == (rec[i].s+1)) -> 91 | rec[i].a : 0); 92 | rec[i].s = s; 93 | rec[i].f = p; 94 | rec[i].a = p; 95 | 96 | :: s == rec[i].s -> 97 | rec[i].a = MAX(rec[i].a, p); 98 | 99 | :: else -> skip 100 | fi 101 | 102 | // we're back to the proposer's logic now, 103 | // incorporating the recorder's "response". 104 | assert(s <= rec[i].s); 105 | if 106 | :: s == rec[i].s && (s & 3) == 0 -> 107 | g = MAX(g, rec[i].f); // gather props 108 | done = done && (FIT(rec[i].f) == HI); 109 | 110 | :: s == rec[i].s && (s & 3) == 1 -> skip 111 | 112 | :: s == rec[i].s && (s & 3) >= 2 -> 113 | printf("%d step %d got <%d,%d> from %d\n", j, s, FIT(rec[i].m), VAL(rec[i].m), i); 114 | g = MAX(g, rec[i].m); // gather E/C 115 | 116 | :: s < rec[i].s -> // catch up proposer 117 | s = rec[i].s; 118 | p = rec[i].f; 119 | break; 120 | fi 121 | assert(s == rec[i].s); 122 | 123 | recs++; // this recorder has now "replied" 124 | mask = mask | (1 << i); 125 | 126 | select (i : 1 .. 
N); // choose next recorder 127 | 128 | } // atomic 129 | 130 | :: recs < T && (mask & (1 << i)) != 0 -> 131 | // we've already gotten a reply from this recorder, 132 | // so just pick a different one. 133 | select (i : 1 .. N); 134 | 135 | :: recs == T -> // we've heard from a threshold of recorders 136 | 137 | if 138 | :: (s & 3) == 0 -> // propose phase 139 | assert(FIT(g) > 0 && VAL(g) > 0); 140 | p = g; // pick best of some E set 141 | 142 | // Decide early if all proposals were HI fit 143 | if 144 | :: done -> 145 | DECIDE(j, s, p); 146 | :: else -> skip 147 | fi 148 | 149 | :: (s & 3) == 1 -> skip // spreadE phase 150 | 151 | :: (s & 3) == 2 -> // gatherEspreadC phase 152 | // p is now the best of a U set; 153 | // g is the best of all gathered E sets 154 | assert(FIT(g) > 0 && VAL(g) > 0); 155 | if 156 | :: p == g -> 157 | DECIDE(j, s, p); 158 | :: else -> skip 159 | fi 160 | 161 | :: (s & 3) == 3 -> // gatherC phase 162 | // g is the best of all gathered C sets. 163 | // this is our proposal for the next round. 164 | assert(FIT(g) > 0 && VAL(g) > 0); 165 | p = g; 166 | fi 167 | s = s + 1; 168 | break; 169 | od 170 | 171 | :: s > STEPHI -> // we've simulated enough time-steps 172 | break; 173 | od 174 | } 175 | 176 | init { 177 | assert(HI < 1 << FITBITS); 178 | assert(VALS < 1 << VALBITS); 179 | 180 | decided = 0; // we haven't decided yet 181 | 182 | // first choose the "well-known" leader, or 0 for no leader 183 | //leader = 0; // no leader 184 | leader = 1; // fixed leader 185 | //select (leader : 0 .. M); // any (or no) leader 186 | 187 | atomic { 188 | int i; 189 | for (i : 1 .. M) { // Launch M proposers 190 | run Proposer(i) 191 | } 192 | } 193 | } 194 | 195 | -------------------------------------------------------------------------------- /spin/qpm.pml: -------------------------------------------------------------------------------- 1 | // Simple model of QuePaxa consensus. 2 | // Uses explicit message-based communication with recorders. 3 | 4 | #define N 3 // total number of recorder (state) nodes 5 | #define F 1 // number of failures tolerated 6 | #define T (N-F) // consensus threshold required 7 | 8 | #define M 2 // number of proposers (clients) 9 | 10 | #define STEPHI 11 // highest step number to simulate 11 | #define RAND 2 // random part of fitness space is 1..RAND 12 | #define HI (RAND+1) // top priority for proposals by leader 13 | #define VALS 2 // space of preferred values is 1..VALS 14 | 15 | // A proposal is an integer divided into two bit-fields: fitness and value. 16 | #define VALBITS 4 17 | #define FITBITS 4 18 | #define VALSHIFT (0) 19 | #define FITSHIFT (VALBITS) 20 | #define PROP(f,v) (((f)<> VALSHIFT) & ((1 << VALBITS)-1)) 22 | #define FIT(p) (((p) >> FITSHIFT) & ((1 << FITBITS)-1)) 23 | 24 | #define MAX(a, b) ((a) > (b) -> (a) : (b)) 25 | 26 | byte leader; // which proposer is the well-known leader 27 | byte decided; // proposed value that we've decided on 28 | byte propsdone; // number of proposers that have finished 29 | 30 | // Channels for recorder/proposer communication. 31 | chan creq[1+N] = [0] of { byte, byte, byte } // 32 | chan crep[1+M] = [0] of { byte, byte, byte, byte, byte} // 33 | 34 | #define DECIDE(j, s, p) atomic { \ 35 | printf("%d step %d decided <%d,%d>", j, s, FIT(p), VAL(p)); \ 36 | assert(decided == 0 || decided == VAL(p)); \ 37 | decided = VAL(p); \ 38 | } 39 | 40 | // Each proposer is a process. 
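The two channel declarations above appear to have lost their trailing field comments in this rendering. Judging from how the channels are used in the Proposer and Recorder processes below, the request and reply payloads carry roughly the following fields (a hypothetical Go transcription, not part of the repository):

```go
// Request a proposer sends to recorder i on creq[i].
type request struct {
	proposer byte // j: which proposer is asking
	step     byte // s: the proposer's current logical time step
	proposal byte // p: packed <fitness,value> proposal
}

// Reply a recorder sends back to proposer j on crep[j].
type reply struct {
	recorder byte // i: which recorder is answering
	reqStep  byte // rs: step echoed back from the request
	recStep  byte // s: the recorder's own (possibly newer) step
	first    byte // f: first proposal the recorder saw in recStep
	maxPrior byte // m: best proposal the recorder saw in step recStep-1
}
```

Because the channels are declared with capacity 0 (rendezvous), a send blocks until the matching receive, which is why the Recorder's reply below needs the propsdone escape hatch.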
41 | proctype Proposer(byte j) { // We're proposer j in 1..M 42 | byte s; 43 | byte p, g; 44 | byte ri, rs, rsn, rfn, rmn; // responses we get from recorders 45 | byte i, sent, recs; // request send and reply receiving state 46 | bit done; // for detecting early-decision opportunities 47 | 48 | // Choose the arbitrary initial "preferred value" of this proposer 49 | s = 4; 50 | select (p : 1 .. VALS); // select a "random" value into temporary 51 | printf("%d proposing %d\n", j, p); 52 | p = PROP(HI, p); 53 | 54 | // Initialize per-step state for the first step of the first round. 55 | printf("%d step %d\n", j, s); 56 | sent = 0; // bit mask of recorders we've sent to 57 | recs = 0; // number of recorders we've heard from 58 | g = 0; // gather best response proposer saw so far 59 | done = true; 60 | 61 | i = 0; // first, send to a channel no one listens on 62 | do 63 | :: creq[i] ! j, s, p -> // send a request for this step to recorder i 64 | printf("%d step %d sent <%d,%d> to %d\n", 65 | j, s, FIT(p), VAL(p), i); 66 | sent = sent | (1 << i); // successfully sent 67 | i = 0; // now we have no target again 68 | 69 | :: s <= STEPHI && recs < T -> // choose a recorder to send to 70 | 71 | // randomize fitness in phase 0 if we're not the leader 72 | if 73 | :: (s & 3) == 0 && j != leader -> 74 | byte r; 75 | select(r : 1 .. RAND); 76 | p = PROP(r, VAL(p)); 77 | :: else -> skip 78 | fi 79 | assert(FIT(p) > 0 && VAL(p) > 0); 80 | 81 | // choose a recorder that we haven't already sent a request to 82 | // revert to i=0 if we've already sent to selected recorder 83 | select (i : 1 .. N); 84 | i = ((sent & (1 << i)) == 0 -> i : 0); 85 | 86 | :: crep[j] ? ri, rs, rsn, rfn, rmn -> // get response from a recorder 87 | printf("%d step %d recv %d %d <%d,%d>,<%d,%d> from %d\n", 88 | j, s, rs, rsn, FIT(rfn), VAL(rfn), 89 | FIT(rmn), VAL(rmn), i); 90 | assert(rs <= s); // should get replies only to requests 91 | if 92 | :: rs < s -> skip // discard old unneeded replies 93 | 94 | :: rs == s && rsn > s -> // catch up to new recorder state 95 | s = rsn; // adopt recorder's round start state 96 | p = rfn; 97 | 98 | // initialize per-step state for the new time-step 99 | printf("%d step %d\n", j, s); 100 | sent = 0; // bit mask of recorders we've sent to 101 | recs = 0; // number of recorders we've heard from 102 | g = 0; // best response proposer saw so far 103 | done = true; 104 | 105 | :: rs == s && rsn == s && (s & 3) == 0 -> // propose phase 106 | g = MAX(g, rfn); // gather best of all first proposals 107 | done = done && (FIT(rfn) == HI); 108 | recs++; // this recorder has now replied 109 | 110 | :: rs == s && rsn == s && (s & 3) == 1 -> // spread E phase 111 | recs++; // this recorder has now replied 112 | 113 | :: rs == s && rsn == s && (s & 3) >= 2 -> // gather E spread C 114 | g = MAX(g, rmn); // gather best of E or C sets 115 | recs++; // this recorder has now replied 116 | fi 117 | assert(recs <= N); // shouldn't get any extra replies 118 | 119 | ri = 0; // clear temporaries 120 | rs = 0; 121 | rsn = 0; 122 | rfn = 0; 123 | rmn = 0; 124 | 125 | :: s <= STEPHI && recs >= T -> // got a quorum of replies 126 | 127 | // handle the proposer's completion of this round 128 | if 129 | :: (s & 3) == 0 -> // propose phase 130 | assert(FIT(g) > 0 && VAL(g) > 0); 131 | p = g; // pick best of some E set 132 | 133 | // Decide early if all proposals were HI fit 134 | if 135 | :: done -> 136 | DECIDE(j, s, p); 137 | :: else -> skip 138 | fi 139 | 140 | :: (s & 3) == 1 -> skip // spread E phase: nothing to do 141 | 
142 | :: (s & 3) == 2 -> // gather E spread C phase 143 | // p is now the best of some universal (U) set; 144 | // g is the best of all the E sets we gathered. 145 | assert(FIT(g) > 0 && VAL(g) > 0); 146 | if 147 | :: p == g -> 148 | DECIDE(j, s, p); 149 | :: else -> skip 150 | fi 151 | 152 | :: (s & 3) == 3 -> // gather C phase 153 | // g is the best of all common (C) sets we gathered; 154 | // this becomes our proposal for the next round. 155 | assert(FIT(g) > 0 && VAL(g) > 0); 156 | p = g; 157 | fi 158 | 159 | // proceed to next logical time-step 160 | s = s + 1; 161 | 162 | // initialize per-step state for the new time-step 163 | printf("%d step %d\n", j, s); 164 | sent = 0; // bit mask of recorders we've sent to 165 | recs = 0; // number of recorders we've heard from 166 | g = 0; // best response proposer saw so far 167 | done = true; 168 | 169 | :: s > STEPHI -> // we've simulated enough time-steps 170 | break; 171 | od 172 | 173 | // count terminated proposers so recorders can terminate too 174 | atomic { 175 | propsdone++; 176 | } 177 | } 178 | 179 | // Each recorder is a process implementing an interval summary register (ISR). 180 | proctype Recorder(byte i) { // We're proposer j in 1..M 181 | byte s, f, a, m; 182 | byte rj, rs, rv; 183 | 184 | do 185 | :: creq[i] ? rj, rs, rv -> // got request from proposer rj 186 | if 187 | :: rs == s -> 188 | a = MAX(a, rv); // accumulate max of all values 189 | 190 | :: rs > s -> // forward to a later step 191 | m = (rs == s+1 -> a : 0); 192 | s = rs; 193 | f = rv; 194 | a = rv; 195 | 196 | :: else -> skip 197 | fi 198 | 199 | // send reply to the proposer -- 200 | // but don't block forever if all proposers terminate. 201 | if 202 | :: crep[rj] ! i, rs, s, f, m // reply succeeded 203 | :: propsdone == M -> break // done while trying to send 204 | fi 205 | 206 | rj = 0; // clear temporaries 207 | rs = 0; 208 | rv = 0; 209 | 210 | :: propsdone == M -> // all proposers terminated? 211 | break; // terminate recorder thread 212 | od 213 | } 214 | 215 | // The initialization process just gets things launched. 216 | init { 217 | assert(HI < 1 << FITBITS); 218 | assert(VALS < 1 << VALBITS); 219 | 220 | decided = 0; // we haven't decided yet 221 | 222 | // first choose the "well-known" leader, or 0 for no leader 223 | //leader = 0; // no leader 224 | leader = 1; // fixed leader 225 | //select (leader : 0 .. M); // any (or no) leader 226 | 227 | atomic { 228 | int i, j; 229 | 230 | for (i : 1 .. N) { // Launch N recorders 231 | run Recorder(i) 232 | } 233 | for (j : 1 .. 
M) { // Launch M proposers 234 | run Proposer(j) 235 | } 236 | } 237 | } 238 | 239 | -------------------------------------------------------------------------------- /spin/qsc.pml: -------------------------------------------------------------------------------- 1 | 2 | #define N 3 // total number of nodes 3 | #define Fa 1 // max number of availability failures 4 | #define Fc 0 // max number of correctness failures 5 | #define T (Fa+Fc+1) // consensus threshold required 6 | 7 | #define STEPS 3 // TLC time-steps per consensus round 8 | #define ROUNDS 2 // number of consensus rounds to run 9 | #define TICKETS 3 // proposal lottery ticket space 10 | 11 | // TLC state for each logical time-step 12 | typedef Step { 13 | bit sent; // true if we've sent our raw proposal 14 | bit seen[1+N]; // nodes whose raw proposals we've received 15 | bit ackd[1+N]; // nodes who have acknowledged our raw proposal 16 | bit witd; // true if our proposal is threshold witnessed 17 | bit witn[1+N]; // nodes we've gotten threshold witnessed msgs from 18 | } 19 | 20 | // QSC summary information for a "best" proposal seen so far 21 | typedef Best { 22 | byte from; // node number the proposal is from, 0 if tied spoiler 23 | byte tkt; // proposal's genetic fitness ticket value 24 | } 25 | 26 | // TLC and QSC state per round 27 | typedef Round { 28 | Step step[STEPS]; // TLC state for each logical time-step 29 | 30 | byte ticket; // QSC lottery ticket assigned to proposal at t+0 31 | Best spoil; // best potential spoiler(s) we've found so far 32 | Best conf; // best confirmed proposal we've seen so far 33 | Best reconf; // best reconfirmed proposal we've seen so far 34 | byte picked; // which proposal this node picked this round, 0 if not yet 35 | } 36 | 37 | // Per-node state 38 | typedef Node { 39 | Round rnd[ROUNDS]; // each node's per-consensus-round information 40 | } 41 | 42 | Node node[1+N]; // all state of each node 1..N 43 | 44 | 45 | // Implement a given node i. 46 | proctype NodeProc(byte i) { 47 | byte j, r, s, tkt, step, acks, wits; 48 | 49 | for (r : 0 .. ROUNDS-1) { 50 | 51 | atomic { 52 | // select a "random" (here just arbitrary) ticket 53 | select (tkt : 1 .. TICKETS); 54 | node[i].rnd[r].ticket = tkt; 55 | 56 | // start with our own proposal as best potential spoiler 57 | node[i].rnd[r].spoil.from = i; 58 | node[i].rnd[r].spoil.tkt = tkt; 59 | } // atomic 60 | 61 | // Run the round to completion 62 | for (s : 0 .. STEPS-1) { 63 | 64 | // "send" the broadcast for this time-step 65 | node[i].rnd[r].step[s].sent = 1; 66 | 67 | // collect a threshold of other nodes' broadcasts 68 | acks = 0; 69 | wits = 0; 70 | do 71 | :: // Pick another node to "receive" a message from 72 | select (j : 1 .. 
N); 73 | atomic { 74 | 75 | // Track the best potential spoiler we encounter 76 | if 77 | // Node j knows about a strictly better potential spoiler 78 | :: node[j].rnd[r].spoil.tkt > node[i].rnd[r].spoil.tkt -> 79 | node[i].rnd[r].spoil.from = node[j].rnd[r].spoil.from; 80 | node[i].rnd[r].spoil.tkt = node[j].rnd[r].spoil.tkt; 81 | 82 | // Node j knows about a spoiler that's tied with our best 83 | :: node[j].rnd[r].spoil.tkt == node[i].rnd[r].spoil.tkt && 84 | node[j].rnd[r].spoil.from != node[i].rnd[r].spoil.from -> 85 | node[i].rnd[r].spoil.from = 0; // tied, so mark invalid 86 | 87 | :: else -> skip 88 | fi 89 | 90 | // Track the best confirmed proposal we encounter 91 | if 92 | :: node[j].rnd[r].conf.tkt > node[i].rnd[r].conf.tkt -> 93 | node[i].rnd[r].conf.from = node[j].rnd[r].conf.from; 94 | node[i].rnd[r].conf.tkt = node[j].rnd[r].conf.tkt; 95 | :: else -> skip 96 | fi 97 | 98 | // Track the best reconfirmed proposal we encounter 99 | if 100 | :: node[j].rnd[r].reconf.tkt > node[i].rnd[r].reconf.tkt -> 101 | node[i].rnd[r].reconf.from = node[j].rnd[r].reconf.from; 102 | node[i].rnd[r].reconf.tkt = node[j].rnd[r].reconf.tkt; 103 | :: else -> skip 104 | fi 105 | 106 | // Now handle specific types of messages: Raw, Ack, or Wit. 107 | if 108 | 109 | // We "receive" a raw unwitnessed message from node j 110 | :: node[j].rnd[r].step[s].sent && !node[i].rnd[r].step[s].seen[j] -> 111 | 112 | node[i].rnd[r].step[s].seen[j] = 1; 113 | 114 | // We "receive" an acknowledgment of our message from node j 115 | :: node[j].rnd[r].step[s].seen[i] && !node[i].rnd[r].step[s].ackd[j] -> 116 | 117 | node[i].rnd[r].step[s].ackd[j] = 1; 118 | acks++; 119 | if 120 | :: acks >= T -> 121 | // Our proposal is now fully threshold witnessed 122 | node[i].rnd[r].step[s].witd = 1 123 | 124 | // See if our proposal is now the best confirmed proposal 125 | if 126 | :: s == 0 && 127 | node[i].rnd[r].ticket > node[i].rnd[r].conf.tkt -> 128 | node[i].rnd[r].conf.from = i; 129 | node[i].rnd[r].conf.tkt = node[i].rnd[r].ticket; 130 | :: else -> skip 131 | fi 132 | 133 | // See if we're reconfirming a best confirmed proposal 134 | if 135 | :: s == 1 && 136 | node[i].rnd[r].conf.tkt > node[i].rnd[r].reconf.tkt -> 137 | node[i].rnd[r].reconf.from = node[i].rnd[r].conf.from; 138 | node[i].rnd[r].reconf.tkt = node[i].rnd[r].conf.tkt; 139 | :: else -> skip 140 | fi 141 | 142 | :: else -> skip 143 | fi 144 | 145 | // We "receive" a fully threshold witnessed message from node j 146 | :: node[j].rnd[r].step[s].witd && !node[i].rnd[r].step[s].witn[j] -> 147 | 148 | node[i].rnd[r].step[s].witn[j] = 1 149 | wits++; 150 | 151 | // End this step if we've seen enough witnessed proposals 152 | :: wits >= T -> break; 153 | 154 | :: else -> skip 155 | fi 156 | } // atomic 157 | od 158 | } 159 | 160 | atomic { 161 | printf("%d best spoiler %d ticket %d\n", 162 | i, node[i].rnd[r].spoil.from, node[i].rnd[r].spoil.tkt); 163 | printf("%d best confirmed %d ticket %d\n", 164 | i, node[i].rnd[r].conf.from, node[i].rnd[r].conf.tkt); 165 | printf("%d best reconfirmed %d ticket %d\n", 166 | i, node[i].rnd[r].reconf.from, node[i].rnd[r].reconf.tkt); 167 | 168 | // The round is now complete in terms of picking a proposal. 169 | node[i].rnd[r].picked = node[i].rnd[r].conf.from; 170 | 171 | // We can be sure everyone has converged on this proposal 172 | // if it is also the best spoiler and best reconfirmed proposal. 
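Restated as a standalone predicate (a hypothetical Go rendering of the check that follows, where node numbers start at 1 and 0 means "none" or "tied"):

```go
// A node may conclude the round definitely committed only when the
// proposal it picked is simultaneously the best potential spoiler and
// the best reconfirmed proposal it observed; otherwise some other node
// may still have picked a different proposal.
func definitelyCommitted(picked, bestSpoiler, bestReconfirmed byte) bool {
	return picked != 0 &&
		picked == bestSpoiler &&
		picked == bestReconfirmed
}
```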
173 | if 174 | :: node[i].rnd[r].spoil.from == node[i].rnd[r].picked && 175 | node[i].rnd[r].reconf.from == node[i].rnd[r].picked -> 176 | printf("%d round %d definitely COMMITTED\n", i, r); 177 | 178 | // Verify that what we decided doesn't conflict with 179 | // the proposal any other node chooses. 180 | select (j : 1 .. N); 181 | assert(!node[j].rnd[r].picked || 182 | (node[j].rnd[r].picked == node[i].rnd[r].picked)); 183 | 184 | :: node[i].rnd[r].reconf.from != node[i].rnd[r].picked -> 185 | printf("%d round %d FAILED to be reconfirmed\n", i, r); 186 | 187 | :: node[i].rnd[r].spoil.from != node[i].rnd[r].picked -> 188 | printf("%d round %d FAILED due to spoiler\n", i, r); 189 | 190 | :: node[i].rnd[r].spoil.from == 0 -> 191 | printf("%d round %d FAILED due to tie\n", i, r); 192 | 193 | :: else -> 194 | fi 195 | } // atomic 196 | } 197 | } 198 | 199 | init { 200 | atomic { 201 | int i; 202 | for (i : 1 .. N) { 203 | run NodeProc(i) 204 | } 205 | } 206 | } 207 | 208 | -------------------------------------------------------------------------------- /spin/results-qp.txt: -------------------------------------------------------------------------------- 1 | qp.pml verification: 2 | 3 | Bitstate verification using spin -search -O2 -safety -DMEMLIM=60000 $1 4 | Results from running on Bryan's 2019 MacBook Pro M1 Max. 5 | 6 | --- 7 | 7 steps (1 full consensus round in steps 4-7): 8 | 9 | Depth= 180 States= 1.4e+07 Transitions= 2.51e+07 Memory= 1088.105 t= 5.64 R= 2e+06 10 | 11 | (Spin Version 6.5.2 -- 6 December 2019) 12 | + Partial Order Reduction 13 | 14 | Full statespace search for: 15 | never claim - (none specified) 16 | assertion violations + 17 | cycle checks - (disabled by -DSAFETY) 18 | invalid end states + 19 | 20 | State-vector 60 byte, depth reached 180, errors: 0 21 | 14561376 states, stored 22 | 11554544 states, matched 23 | 26115920 transitions (= stored+matched) 24 | 14444193 atomic steps 25 | hash conflicts: 3406143 (resolved) 26 | 27 | Stats on memory usage (in Megabytes): 28 | 1222.039 equivalent memory usage for states (stored*(State-vector + overhead)) 29 | 998.371 actual memory usage for states (compression: 81.70%) 30 | state-vector as stored = 44 byte + 28 byte overhead 31 | 128.000 memory used for hash table (-w24) 32 | 0.534 memory used for DFS stack (-m10000) 33 | 1126.581 total actual memory usage 34 | 35 | 36 | unreached in proctype Proposer 37 | (0 of 114 states) 38 | unreached in init 39 | (0 of 16 states) 40 | 41 | pan: elapsed time 5.92 seconds 42 | pan: rate 2459691.9 states/second 43 | 44 | 45 | --- 46 | 8 steps: 47 | 48 | Depth= 220 States= 4.3e+07 Transitions= 7.36e+07 Memory= 3559.675 t= 22.4 R= 2e+06 49 | 50 | (Spin Version 6.5.2 -- 6 December 2019) 51 | + Partial Order Reduction 52 | 53 | Full statespace search for: 54 | never claim - (none specified) 55 | assertion violations + 56 | cycle checks - (disabled by -DSAFETY) 57 | invalid end states + 58 | 59 | State-vector 60 byte, depth reached 220, errors: 0 60 | 43443684 states, stored 61 | 31011944 states, matched 62 | 74455628 transitions (= stored+matched) 63 | 38765865 atomic steps 64 | hash conflicts: 24394276 (resolved) 65 | 66 | Stats on memory usage (in Megabytes): 67 | 3645.939 equivalent memory usage for states (stored*(State-vector + overhead)) 68 | 3078.502 actual memory usage for states (compression: 84.44%) 69 | state-vector as stored = 46 byte + 28 byte overhead 70 | 512.000 memory used for hash table (-w26) 71 | 0.534 memory used for DFS stack (-m10000) 72 | 3590.144 total actual memory 
usage 73 | 74 | 75 | unreached in proctype Proposer 76 | (0 of 114 states) 77 | unreached in init 78 | (0 of 16 states) 79 | 80 | pan: elapsed time 22.6 seconds 81 | pan: rate 1919738.6 states/second 82 | 83 | 84 | --- 85 | 9 steps: 86 | 87 | Depth= 262 States= 1.17e+08 Transitions= 2.12e+08 Memory= 8642.683 t= 67.9 R= 2e+06 88 | 89 | (Spin Version 6.5.2 -- 6 December 2019) 90 | + Partial Order Reduction 91 | 92 | Full statespace search for: 93 | never claim - (none specified) 94 | assertion violations + 95 | cycle checks - (disabled by -DSAFETY) 96 | invalid end states + 97 | 98 | State-vector 60 byte, depth reached 262, errors: 0 99 | 1.1701493e+08 states, stored 100 | 95266996 states, matched 101 | 2.1228193e+08 transitions (= stored+matched) 102 | 1.3433094e+08 atomic steps 103 | hash conflicts: 80036981 (resolved) 104 | 105 | Stats on memory usage (in Megabytes): 106 | 9820.284 equivalent memory usage for states (stored*(State-vector + overhead)) 107 | 8133.282 actual memory usage for states (compression: 82.82%) 108 | state-vector as stored = 45 byte + 28 byte overhead 109 | 512.000 memory used for hash table (-w26) 110 | 0.534 memory used for DFS stack (-m10000) 111 | 2.157 memory lost to fragmentation 112 | 8643.659 total actual memory usage 113 | 114 | 115 | unreached in proctype Proposer 116 | (0 of 114 states) 117 | unreached in init 118 | (0 of 16 states) 119 | 120 | pan: elapsed time 67.9 seconds 121 | pan: rate 1722834.7 states/second 122 | 123 | 124 | --- 125 | 10 steps: 126 | 127 | Depth= 302 States= 1.93e+08 Transitions= 3.66e+08 Memory= 15855.624 t= 121 R= 2e+06 128 | 129 | (Spin Version 6.5.2 -- 6 December 2019) 130 | + Partial Order Reduction 131 | 132 | Full statespace search for: 133 | never claim - (none specified) 134 | assertion violations + 135 | cycle checks - (disabled by -DSAFETY) 136 | invalid end states + 137 | 138 | State-vector 60 byte, depth reached 302, errors: 0 139 | 1.9397366e+08 states, stored 140 | 1.7384229e+08 states, matched 141 | 3.6781595e+08 transitions (= stored+matched) 142 | 2.374664e+08 atomic steps 143 | hash conflicts: 1.350289e+08 (resolved) 144 | 145 | Stats on memory usage (in Megabytes): 146 | 16278.918 equivalent memory usage for states (stored*(State-vector + overhead)) 147 | 13877.107 actual memory usage for states (compression: 85.25%) 148 | state-vector as stored = 47 byte + 28 byte overhead 149 | 2048.000 memory used for hash table (-w28) 150 | 0.534 memory used for DFS stack (-m10000) 151 | 3.122 memory lost to fragmentation 152 | 15922.519 total actual memory usage 153 | 154 | 155 | unreached in proctype Proposer 156 | (0 of 114 states) 157 | unreached in init 158 | (0 of 16 states) 159 | 160 | pan: elapsed time 122 seconds 161 | pan: rate 1595047 states/second 162 | 163 | 164 | --- 165 | 11 steps (2 full consensus rounds: steps 4-7 and 8-11): 166 | 167 | Depth= 338 States= 2.45e+08 Transitions= 4.68e+08 Memory= 19425.351 t= 153 R= 2e+06 168 | 169 | (Spin Version 6.5.2 -- 6 December 2019) 170 | + Partial Order Reduction 171 | 172 | Full statespace search for: 173 | never claim - (none specified) 174 | assertion violations + 175 | cycle checks - (disabled by -DSAFETY) 176 | invalid end states + 177 | 178 | State-vector 60 byte, depth reached 338, errors: 0 179 | 2.4529035e+08 states, stored 180 | 2.2295857e+08 states, matched 181 | 4.6824892e+08 transitions (= stored+matched) 182 | 3.0213691e+08 atomic steps 183 | hash conflicts: 1.5778641e+08 (resolved) 184 | 185 | Stats on memory usage (in Megabytes): 186 | 20585.585 
equivalent memory usage for states (stored*(State-vector + overhead)) 187 | 17400.834 actual memory usage for states (compression: 84.53%) 188 | state-vector as stored = 46 byte + 28 byte overhead 189 | 2048.000 memory used for hash table (-w28) 190 | 0.534 memory used for DFS stack (-m10000) 191 | 4.096 memory lost to fragmentation 192 | 19445.272 total actual memory usage 193 | 194 | 195 | unreached in proctype Proposer 196 | (0 of 114 states) 197 | unreached in init 198 | (0 of 16 states) 199 | 200 | pan: elapsed time 153 seconds 201 | pan: rate 1600380.7 states/second 202 | 203 | 204 | --- 205 | 12 steps: 206 | 207 | Depth= 378 States= 3.85e+08 Transitions= 7.09e+08 Memory= 28987.069 t= 244 R= 2e+06 208 | 209 | (Spin Version 6.5.2 -- 6 December 2019) 210 | + Partial Order Reduction 211 | 212 | Full statespace search for: 213 | never claim - (none specified) 214 | assertion violations + 215 | cycle checks - (disabled by -DSAFETY) 216 | invalid end states + 217 | 218 | State-vector 60 byte, depth reached 378, errors: 0 219 | 3.8578596e+08 states, stored 220 | 3.2452935e+08 states, matched 221 | 7.1031531e+08 transitions (= stored+matched) 222 | 4.3387088e+08 atomic steps 223 | hash conflicts: 2.5898853e+08 (resolved) 224 | 225 | Stats on memory usage (in Megabytes): 226 | 32376.446 equivalent memory usage for states (stored*(State-vector + overhead)) 227 | 26998.934 actual memory usage for states (compression: 83.39%) 228 | state-vector as stored = 45 byte + 28 byte overhead 229 | 2048.000 memory used for hash table (-w28) 230 | 0.534 memory used for DFS stack (-m10000) 231 | 6.396 memory lost to fragmentation 232 | 29041.073 total actual memory usage 233 | 234 | 235 | unreached in proctype Proposer 236 | (0 of 114 states) 237 | unreached in init 238 | (0 of 16 states) 239 | 240 | pan: elapsed time 244 seconds 241 | pan: rate 1579665.7 states/second 242 | 243 | -------------------------------------------------------------------------------- /spin/results-qpm.txt: -------------------------------------------------------------------------------- 1 | qpm.pml verification: 2 | 3 | Bitstate verification using spin -search -O2 -safety -bitstate -w38 $1 4 | (32GB state hash table). 5 | Results from running on Bryan's 2019 MacBook Pro M1 Max. 6 | 7 | --- 8 | 4 steps: 9 | 10 | Depth= 1007 States= 3e+06 Transitions= 4.93e+06 Memory= 32768.925 t= 3.14 R= 1e+06 11 | 12 | (Spin Version 6.5.2 -- 6 December 2019) 13 | + Partial Order Reduction 14 | 15 | Bit statespace search for: 16 | never claim - (none specified) 17 | assertion violations + 18 | cycle checks - (disabled by -DSAFETY) 19 | invalid end states + 20 | 21 | State-vector 168 byte, depth reached 1007, errors: 0 22 | 3871180 states, stored 23 | 2503227 states, matched 24 | 6374407 transitions (= stored+matched) 25 | 20 atomic steps 26 | 27 | hash factor: 71006.2 (best if > 100.) 
28 | 29 | bits set per state: 3 (-k3) 30 | 31 | Stats on memory usage (in Megabytes): 32 | 694.067 equivalent memory usage for states (stored*(State-vector + overhead)) 33 | 32768.000 memory used for hash array (-w38) 34 | 0.076 memory used for bit stack 35 | 0.534 memory used for DFS stack (-m10000) 36 | 32768.925 total actual memory usage 37 | 38 | 39 | unreached in proctype Proposer 40 | qpm.pml:115, state 58, "recs = (recs+1)" 41 | qpm.pml:140, state 81, "(1)" 42 | qpm.pml:148, state 87, "decided = ((p>>0)&((1<<4)-1))" 43 | qpm.pml:149, state 90, "(1)" 44 | qpm.pml:147, state 91, "((p==g))" 45 | qpm.pml:147, state 91, "else" 46 | qpm.pml:156, state 95, "p = g" 47 | (6 of 111 states) 48 | unreached in proctype Recorder 49 | (0 of 26 states) 50 | unreached in init 51 | (0 of 26 states) 52 | 53 | pan: elapsed time 3.96 seconds 54 | pan: rate 977570.71 states/second 55 | 56 | 57 | --- 58 | 5 steps: 59 | 60 | Depth= 1743 States= 2.13e+08 Transitions= 3.74e+08 Memory= 32769.120 t= 213 R= 1e+06 61 | 62 | (Spin Version 6.5.2 -- 6 December 2019) 63 | + Partial Order Reduction 64 | 65 | Bit statespace search for: 66 | never claim - (none specified) 67 | assertion violations + 68 | cycle checks - (disabled by -DSAFETY) 69 | invalid end states + 70 | 71 | State-vector 168 byte, depth reached 1743, errors: 0 72 | 2.1362823e+08 states, stored 73 | 1.6166948e+08 states, matched 74 | 3.7529772e+08 transitions (= stored+matched) 75 | 20 atomic steps 76 | 77 | hash factor: 1286.71 (best if > 100.) 78 | 79 | bits set per state: 3 (-k3) 80 | 81 | Stats on memory usage (in Megabytes): 82 | 38301.571 equivalent memory usage for states (stored*(State-vector + overhead)) 83 | 32768.000 memory used for hash array (-w38) 84 | 0.076 memory used for bit stack 85 | 0.534 memory used for DFS stack (-m10000) 86 | 32769.120 total actual memory usage 87 | 88 | 89 | unreached in proctype Proposer 90 | qpm.pml:148, state 87, "decided = ((p>>0)&((1<<4)-1))" 91 | qpm.pml:149, state 90, "(1)" 92 | qpm.pml:147, state 91, "((p==g))" 93 | qpm.pml:147, state 91, "else" 94 | qpm.pml:156, state 95, "p = g" 95 | (4 of 111 states) 96 | unreached in proctype Recorder 97 | (0 of 26 states) 98 | unreached in init 99 | (0 of 26 states) 100 | 101 | pan: elapsed time 213 seconds 102 | pan: rate 1001116.4 states/second 103 | 104 | --- 105 | 6 steps: 106 | 107 | Depth= 2323 States= 1.19e+09 Transitions= 2.14e+09 Memory= 32769.218 t= 1.11e+03 R= 1e+06 108 | 109 | (Spin Version 6.5.2 -- 6 December 2019) 110 | + Partial Order Reduction 111 | 112 | Bit statespace search for: 113 | never claim - (none specified) 114 | assertion violations + 115 | cycle checks - (disabled by -DSAFETY) 116 | invalid end states + 117 | 118 | State-vector 168 byte, depth reached 2323, errors: 0 119 | 1.1925986e+09 states, stored 120 | 9.5240049e+08 states, matched 121 | 2.1449991e+09 transitions (= stored+matched) 122 | 20 atomic steps 123 | 124 | hash factor: 230.487 (best if > 100.) 
125 | 126 | bits set per state: 3 (-k3) 127 | 128 | Stats on memory usage (in Megabytes): 129 | 213821.928 equivalent memory usage for states (stored*(State-vector + overhead)) 130 | 32768.000 memory used for hash array (-w38) 131 | 0.076 memory used for bit stack 132 | 0.534 memory used for DFS stack (-m10000) 133 | 32769.218 total actual memory usage 134 | 135 | 136 | unreached in proctype Proposer 137 | qpm.pml:156, state 95, "p = g" 138 | (1 of 111 states) 139 | unreached in proctype Recorder 140 | (0 of 26 states) 141 | unreached in init 142 | (0 of 26 states) 143 | 144 | pan: elapsed time 1.11e+03 seconds 145 | pan: rate 1070699.5 states/second 146 | 147 | --- 148 | 7 steps: 149 | 150 | Depth= 3018 States= 3.57e+09 Transitions= 6.48e+09 Memory= 32769.315 t= 3.44e+03 R= 1e+06 151 | 152 | (Spin Version 6.5.2 -- 6 December 2019) 153 | + Partial Order Reduction 154 | 155 | Bit statespace search for: 156 | never claim - (none specified) 157 | assertion violations + 158 | cycle checks - (disabled by -DSAFETY) 159 | invalid end states + 160 | 161 | State-vector 168 byte, depth reached 3018, errors: 0 162 | 3.5701183e+09 states, stored 163 | 2.9060201e+09 states, matched 164 | 6.4761385e+09 transitions (= stored+matched) 165 | 20 atomic steps 166 | 167 | hash factor: 76.9941 (best if > 100.) 168 | 169 | bits set per state: 3 (-k3) 170 | 171 | Stats on memory usage (in Megabytes): 172 | 640089.267 equivalent memory usage for states (stored*(State-vector + overhead)) 173 | 32768.000 memory used for hash array (-w38) 174 | 0.076 memory used for bit stack 175 | 0.534 memory used for DFS stack (-m10000) 176 | 32769.315 total actual memory usage 177 | 178 | 179 | unreached in proctype Proposer 180 | (0 of 111 states) 181 | unreached in proctype Recorder 182 | (0 of 26 states) 183 | unreached in init 184 | (0 of 26 states) 185 | 186 | pan: elapsed time 3.44e+03 seconds 187 | pan: rate 1037038.3 states/second 188 | 189 | 190 | --- 191 | 8 steps: 192 | 193 | Depth= 3741 States= 1.55e+10 Transitions= 2.7e+10 Memory= 32769.511 t= 1.6e+04 R= 1e+06 194 | 195 | (Spin Version 6.5.2 -- 6 December 2019) 196 | + Partial Order Reduction 197 | 198 | Bit statespace search for: 199 | never claim - (none specified) 200 | assertion violations + 201 | cycle checks - (disabled by -DSAFETY) 202 | invalid end states + 203 | 204 | State-vector 168 byte, depth reached 3741, errors: 0 205 | 1.5529605e+10 states, stored 206 | 1.1502249e+10 states, matched 207 | 2.7031855e+10 transitions (= stored+matched) 208 | 20 atomic steps 209 | 210 | hash factor: 17.7003 (best if > 100.) 
211 | 212 | bits set per state: 3 (-k3) 213 | 214 | Stats on memory usage (in Megabytes): 215 | 2784314.887 equivalent memory usage for states (stored*(State-vector + overhead)) 216 | 32768.000 memory used for hash array (-w38) 217 | 0.076 memory used for bit stack 218 | 0.534 memory used for DFS stack (-m10000) 219 | 32769.511 total actual memory usage 220 | 221 | 222 | unreached in proctype Proposer 223 | (0 of 111 states) 224 | unreached in proctype Recorder 225 | (0 of 26 states) 226 | unreached in init 227 | (0 of 26 states) 228 | 229 | pan: elapsed time 1.6e+04 seconds 230 | pan: rate 969476.33 states/second 231 | 232 | --- 233 | 10 steps: 234 | 235 | Depth= 4912 States= 6.2e+10 Transitions= 1.1e+11 Memory= 32769.706 t= 1.06e+05 R= 6e+05 236 | 237 | (Spin Version 6.5.2 -- 6 December 2019) 238 | + Partial Order Reduction 239 | 240 | Bit statespace search for: 241 | never claim - (none specified) 242 | assertion violations + 243 | cycle checks - (disabled by -DSAFETY) 244 | invalid end states + 245 | 246 | State-vector 168 byte, depth reached 4912, errors: 0 247 | 6.1979144e+10 states, stored 248 | 4.7682823e+10 states, matched 249 | 1.0966197e+11 transitions (= stored+matched) 250 | 20 atomic steps 251 | 252 | hash factor: 4.43501 (best if > 100.) 253 | 254 | bits set per state: 3 (-k3) 255 | 256 | Stats on memory usage (in Megabytes): 257 | 11112288.506 equivalent memory usage for states (stored*(State-vector + overhead)) 258 | 32768.000 memory used for hash array (-w38) 259 | 0.076 memory used for bit stack 260 | 0.534 memory used for DFS stack (-m10000) 261 | 1.014 other (proc and chan stacks) 262 | 32769.706 total actual memory usage 263 | 264 | 265 | unreached in proctype Proposer 266 | (0 of 111 states) 267 | unreached in proctype Recorder 268 | (0 of 26 states) 269 | unreached in init 270 | (0 of 26 states) 271 | 272 | pan: elapsed time 1.06e+05 seconds 273 | pan: rate 583824.35 states/second 274 | 275 | 276 | --- 277 | 11 steps: 278 | 279 | 280 | Depth= 5465 States= 7.1e+10 Transitions= 1.25e+11 Memory= 32769.804 t= 8.45e+04 R= 8e+05 281 | 282 | (Spin Version 6.5.2 -- 6 December 2019) 283 | + Partial Order Reduction 284 | 285 | Bit statespace search for: 286 | never claim - (none specified) 287 | assertion violations + 288 | cycle checks - (disabled by -DSAFETY) 289 | invalid end states + 290 | 291 | State-vector 168 byte, depth reached 5465, errors: 0 292 | 7.0950964e+10 states, stored 293 | 5.4281076e+10 states, matched 294 | 1.2523204e+11 transitions (= stored+matched) 295 | 20 atomic steps 296 | 297 | hash factor: 3.8742 (best if > 100.) 
298 | 299 | bits set per state: 3 (-k3) 300 | 301 | Stats on memory usage (in Megabytes): 302 | 12720853.000 equivalent memory usage for states (stored*(State-vector + overhead)) 303 | 32768.000 memory used for hash array (-w38) 304 | 0.076 memory used for bit stack 305 | 0.534 memory used for DFS stack (-m10000) 306 | 1.111 other (proc and chan stacks) 307 | 32769.804 total actual memory usage 308 | 309 | 310 | unreached in proctype Proposer 311 | (0 of 111 states) 312 | unreached in proctype Recorder 313 | (0 of 26 states) 314 | unreached in init 315 | (0 of 26 states) 316 | 317 | pan: elapsed time 8.45e+04 seconds 318 | pan: rate 839344.87 states/second 319 | 320 | 321 | --- 322 | 12 steps: 323 | 324 | 325 | 326 | -------------------------------------------------------------------------------- /spin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Analyze the consensus model using the Spin model checker. 3 | 4 | # Exhaustive verification. 5 | # MEMLIMIT is the memory-usage limit in megabytes. 6 | #spin -search -O2 -safety -DMEMLIM=60000 $1 7 | 8 | # Set maximum search depth (-m), making it an error to exceed this depth (-b). 9 | #spin -search -O2 -safety -DMEMLIM=60000 -m3870 -b $1 10 | 11 | # Exhaustive verification with state vector compression. 12 | #spin -search -O2 -safety -DMEMLIM=60000 -collapse $1 13 | #spin -search -O2 -safety -DMEMLIM=60000 -hc $1 14 | 15 | # Bitstate verification - most aggressive state compression. 16 | # -w defines the power of two of the hash table size in bits. 17 | # examples: -w28: 32MB, -w33: 1GB, -w38: 32GB 18 | #spin -search -O2 -safety -bitstate -w28 $1 19 | spin -search -O2 -safety -bitstate -w38 $1 20 | 21 | -------------------------------------------------------------------------------- /tools/qsc/.gitignore: -------------------------------------------------------------------------------- 1 | qsc 2 | -------------------------------------------------------------------------------- /tools/qsc/group.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "net/url" 7 | "strings" 8 | 9 | "github.com/bford/cofo/cri" 10 | 11 | "github.com/dedis/tlc/go/lib/cas" 12 | "github.com/dedis/tlc/go/lib/fs/casdir" 13 | "github.com/dedis/tlc/go/model/qscod/qscas" 14 | ) 15 | 16 | // Group represents a QSC consensus group. 17 | // XXX move to a suitable generic package. 18 | type group struct { 19 | qscas.Group 20 | } 21 | 22 | // Open a consensus group identified by the resource identifier ri. 23 | // Creates the group if create is true; otherwise opens existing group state. 24 | // 25 | // Supports composable resource identifier (CRI) as preferred group syntax 26 | // because CRIs cleanly suppport nesting of resource identifiers. 
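Before the Open method itself (defined just below), a brief usage sketch; the member directory paths are hypothetical, and the bare `[...]` shorthand relies on parseGroupRI later in this file expanding it to the qsc scheme.

```go
// Hypothetical usage of the group type in this file: create a
// three-member consensus group backed by local casdir directories.
func exampleOpenGroup(ctx context.Context) error {
	var g group
	// create == true initializes fresh state in each member directory;
	// pass false to reopen a group that already exists.
	return g.Open(ctx, "[/tmp/qsc/m1,/tmp/qsc/m2,/tmp/qsc/m3]", true)
}
```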
27 | // 28 | func (g *group) Open(ctx context.Context, ri string, create bool) error { 29 | 30 | // Parse the group resource identifier into individual members 31 | paths, err := parseGroupRI(ri) 32 | if err != nil { 33 | return err 34 | } 35 | n := len(paths) // number of members in the consensus group 36 | 37 | // Create a POSIX directory-based CAS interface to each store 38 | stores := make([]cas.Store, n) 39 | for i, path := range paths { 40 | st := &casdir.Store{} 41 | if err := st.Init(path, create, create); err != nil { 42 | return err 43 | } 44 | stores[i] = st 45 | } 46 | 47 | // Start a CAS-based consensus group across this set of stores, 48 | // with the default threshold configuration. 49 | // (XXX make this configurable eventually.) 50 | g.Group.Start(ctx, stores, -1) 51 | 52 | return nil 53 | } 54 | 55 | // Parse a group resource identifier into individual member identifiers. 56 | func parseGroupRI(group string) ([]string, error) { 57 | 58 | // Allow just '[...]' as a command-line shorthand for 'qsc[...]' 59 | if len(group) > 0 && group[0] == '[' { 60 | group = "qsc" + group 61 | } 62 | 63 | // Parsing it as an actual CRI/URI is kind of unnecessary so far, 64 | // but may get more interesting with query-string options and such. 65 | rawurl, err := cri.URI.From(group) 66 | if err != nil { 67 | return nil, err 68 | } 69 | //println("rawurl:", rawurl) 70 | url, err := url.Parse(rawurl) 71 | if err != nil { 72 | return nil, err 73 | } 74 | if url.Scheme != "qsc" { 75 | return nil, errors.New("consensus groups must use qsc scheme") 76 | } 77 | 78 | // Parse the nested member paths from the opaque string in the URL. 79 | str, path := url.Opaque, "" 80 | var paths []string 81 | for str != "" { 82 | if i := strings.IndexByte(str, ','); i >= 0 { 83 | path, str = str[:i], str[i+1:] 84 | } else { 85 | path, str = str, "" 86 | } 87 | paths = append(paths, path) 88 | } 89 | if len(paths) < 3 { 90 | return nil, errors.New( 91 | "consensus groups must have minimum three members") 92 | } 93 | 94 | return paths, nil 95 | } 96 | -------------------------------------------------------------------------------- /tools/qsc/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | //"flag" 6 | //"log" 7 | "context" 8 | "os" 9 | ) 10 | 11 | var verbose bool = false 12 | 13 | const usageStr = ` 14 | The qsc command provides tools using Que Sera Consensus (QSC). 15 | 16 | Usage: 17 | 18 | qsc [arguments] 19 | 20 | The types of consensus groups are: 21 | 22 | string Consensus on simple strings 23 | git Consensus on Git repositories 24 | hg Consensus on Mercurial repositories 25 | 26 | Run qsc help for commands that apply to each type. 27 | ` 28 | 29 | func usage(usageString string) { 30 | fmt.Println(usageString) 31 | os.Exit(1) 32 | } 33 | 34 | func main() { 35 | if len(os.Args) < 2 { 36 | usage(usageStr) 37 | } 38 | 39 | // Create a cancelable top-level context and cancel it when we're done, 40 | // to shut down asynchronous consensus access operations cleanly. 
41 | ctx, cancel := context.WithCancel(context.Background()) 42 | defer cancel() 43 | 44 | // Parse consensus group kind 45 | switch os.Args[1] { 46 | case "string": 47 | stringCommand(ctx, os.Args[2:]) 48 | default: 49 | usage(usageStr) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/qsc/string.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "os" 8 | ) 9 | 10 | func stringCommand(ctx context.Context, args []string) { 11 | if len(args) == 0 { 12 | usage(stringUsageStr) 13 | } 14 | switch args[0] { 15 | case "init": 16 | stringInitCommand(ctx, args[1:]) 17 | case "get": 18 | stringGetCommand(ctx, args[1:]) 19 | case "set": 20 | stringSetCommand(ctx, args[1:]) 21 | default: 22 | usage(stringUsageStr) 23 | } 24 | } 25 | 26 | const stringUsageStr = ` 27 | Usage: qsc string [arguments] 28 | 29 | The commands for string-value consensus groups are: 30 | 31 | init initialize a new consensus group 32 | get output the current consensus state as a quoted string 33 | set change the consensus state via atomic compare-and-set 34 | ` 35 | 36 | func stringInitCommand(ctx context.Context, args []string) { 37 | if len(args) != 1 { 38 | usage(stringInitUsageStr) 39 | } 40 | 41 | // Create the consensus group state on each member node 42 | var g group 43 | err := g.Open(ctx, args[0], true) 44 | if err != nil { 45 | log.Fatal(err) 46 | } 47 | } 48 | 49 | const stringInitUsageStr = ` 50 | Usage: qsc string init 51 | 52 | where specifies the consensus group 53 | as a composable resource identifier (CRI). 54 | For example: 55 | 56 | qsc git init qsc[host1:path1,host2:path2,host3:path3] 57 | ` 58 | 59 | func stringGetCommand(ctx context.Context, args []string) { 60 | if len(args) != 1 { 61 | usage(stringGetUsageStr) 62 | } 63 | 64 | // Open the file stores 65 | var g group 66 | err := g.Open(ctx, args[0], false) 67 | if err != nil { 68 | log.Fatal(err) 69 | } 70 | 71 | // Find a consensus view of the last known commit. 72 | ver, val, err := g.CompareAndSet(ctx, "", "") 73 | if err != nil { 74 | log.Fatal(err) 75 | } 76 | 77 | fmt.Printf("version %d state %q\n", ver, val) 78 | } 79 | 80 | const stringGetUsageStr = ` 81 | Usage: qsc string get 82 | 83 | where specifies the consensus group. 84 | Reads and prints the version number and string last committed. 85 | ` 86 | 87 | func stringSetCommand(ctx context.Context, args []string) { 88 | if len(args) != 3 { 89 | usage(stringSetUsageStr) 90 | } 91 | 92 | old := args[1] 93 | new := args[2] 94 | if new == "" { 95 | log.Fatal("The empty string is reserved for the starting state") 96 | } 97 | 98 | // Open the file stores 99 | var g group 100 | err := g.Open(ctx, args[0], false) 101 | if err != nil { 102 | log.Fatal(err) 103 | } 104 | 105 | // Invoke the request compare-and-set operation. 
106 | ver, val, err := g.CompareAndSet(ctx, old, new) 107 | if err != nil { 108 | log.Fatal(err) 109 | } 110 | 111 | fmt.Printf("version %d state %q\n", ver, val) 112 | 113 | // Return success only if the next commit was what we wanted 114 | if val != new { 115 | os.Exit(1) 116 | } 117 | os.Exit(0) 118 | } 119 | 120 | const stringSetUsageStr = ` 121 | Usage: qsc string set <group> <old> <new> 122 | 123 | where: 124 | <group> specifies the consensus group 125 | <old> is the expected existing value string 126 | <new> is the new value to set if it hasn't yet changed from <old> 127 | 128 | Prints the version number and string last committed, 129 | regardless of success or failure. 130 | ` 131 | --------------------------------------------------------------------------------