├── .gitignore
├── .travis.yml
├── README.md
├── erlang
│   └── model
│       ├── .gitignore
│       ├── README.md
│       ├── qsc.erl
│       └── run.sh
├── go
│   ├── dist
│   │   ├── README.md
│   │   ├── causal.go
│   │   ├── dist_test.go
│   │   ├── doc.go
│   │   ├── node.go
│   │   ├── qsc.go
│   │   ├── set.go
│   │   ├── tlc.go
│   │   └── vec.go
│   ├── lib
│   │   ├── backoff
│   │   │   ├── retry.go
│   │   │   ├── retry_test.go
│   │   │   └── rfq
│   │   │       └── doc.go
│   │   ├── cas
│   │   │   ├── cas.go
│   │   │   └── test
│   │   │       ├── cas.go
│   │   │       └── cas_test.go
│   │   ├── doc.go
│   │   └── fs
│   │       ├── atomic
│   │       │   ├── atomic.go
│   │       │   └── atomic_test.go
│   │       ├── casdir
│   │       │   └── state.go
│   │       └── verst
│   │           └── state.go
│   └── model
│       ├── README.md
│       ├── doc.go
│       ├── model_test.go
│       ├── node.go
│       ├── qsc.go
│       ├── qscod
│       │   ├── README.md
│       │   ├── core
│       │   │   ├── cli.go
│       │   │   └── test
│       │   │       ├── cli.go
│       │   │       └── cli_test.go
│       │   ├── encoding
│       │   │   └── enc.go
│       │   ├── fs
│       │   │   ├── casdir
│       │   │   │   └── cas_test.go
│       │   │   ├── simple
│       │   │   │   ├── store.go
│       │   │   │   └── store_test.go
│       │   │   └── store
│       │   │       ├── store.go
│       │   │       └── store_test.go
│       │   └── qscas
│       │       ├── doc.go
│       │       ├── group.go
│       │       ├── group_test.go
│       │       ├── rand.go
│       │       └── store.go
│       ├── quepaxa
│       │   ├── consensus.go
│       │   ├── isr.go
│       │   └── proposal.go
│       └── tlc.go
├── spin
│   ├── README.md
│   ├── qp.pml
│   ├── qpm.pml
│   ├── qsc.pml
│   ├── results-qp.txt
│   ├── results-qpm.txt
│   └── run.sh
└── tools
    └── qsc
        ├── .gitignore
        ├── group.go
        ├── main.go
        └── string.go
/.gitignore:
--------------------------------------------------------------------------------
1 | pan
2 | pan.*
3 | *.pml.trail
4 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: bionic
2 |
3 | language: go
4 |
5 | go:
6 | - 1.12.x
7 |
8 | before_install:
9 | - sudo apt-get install -y spin
10 |
11 | install:
12 | - go get -u golang.org/x/lint/golint
13 |
14 | script:
15 | - go vet ./...
16 | - if [ "$( gofmt -l . )" ]; then gofmt -d .; exit 1; fi
17 | - golint -set_exit_status ./...
18 | - go test ./...
19 | - cd spin; ./run.sh
20 |
21 | notifications:
22 | email: false
23 |
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | This repository contains multiple prototype implementations of
3 | Threshold Logical Clocks (TLC) and Que Sera Consensus (QSC),
4 | as described in the following papers:
5 |
6 | * [Threshold Logical Clocks for Asynchronous Distributed Coordination and Consensus](https://arxiv.org/abs/1907.07010)
7 | * [Que Sera Consensus: Simple Asynchronous Agreement with Private Coins and Threshold Logical Clocks](https://arxiv.org/abs/2003.02291)
8 |
9 | The following prototype implementations of TLC and QSC are available
10 | in multiple languages:
11 |
12 | * [erlang/model](erlang/model/) contains a minimalistic model implementation
13 | of the QSC, TLCB, and TLCR algorithms detailed in the
14 | [new QSC preprint](https://arxiv.org/abs/2003.02291).
15 | This model implements QSC using Erlang processes and communication
16 | on a single machine for illustrative simplicity, although
17 | [distributed Erlang](https://erlang.org/doc/reference_manual/distributed.html)
18 | should make it straightforward to extend this model
19 | to true distributed consensus.
20 | Erlang's [selective receive](https://ndpar.blogspot.com/2010/11/erlang-explained-selective-receive.html)
21 | is particularly well-suited to implementing TLCR concisely.
22 | The model consists of only 73 lines of code
23 | as measured by [cloc](https://github.com/AlDanial/cloc),
24 | including test code,
25 | or only 37 lines comprising the consensus algorithm alone.
26 |
27 | * [go/model](go/model/) contains a minimalistic model implementation in Go
28 | of TLC and QSC as described in the
29 | [original TLC preprint](https://arxiv.org/abs/1907.07010).
30 | This model illustrates the key concepts
31 | using goroutines and shared memory communication for simplicity.
32 | It is not useful in an actual distributed context,
33 | but being less than 200 code lines long
34 | as measured by [cloc](https://github.com/AlDanial/cloc),
35 | it is ideal for studying and understanding TLC and QSC.
36 |
37 | * [go/model/qscod](go/model/qscod/)
38 | contains a model implementation in Go of QSCOD,
39 | the client-driven "on-demand" consensus algorithm outlined in the
40 | [new QSC preprint](https://arxiv.org/abs/2003.02291).
41 | This formulation of QSC consumes no bandwidth or computation
42 | when there is no work to be done (hence on-demand),
43 | and incurs only O(n²) communication complexity
44 | per client-driven agreement.
45 |
46 | * [go/dist](go/dist/) contains a simple but working
47 | "real" distributed implementation of TLC and QSC in Go
48 | for a fail-stop (Paxos-like) threat model.
49 | It uses TCP, TLS encryption and authentication,
50 | and Go's native Gob encoding for inter-node communication.
51 | At less than 1000 code lines long
52 | as measured by [cloc](https://github.com/AlDanial/cloc),
53 | it is still probably one of the simplest implementations
54 | of asynchronous consensus around.
55 |
56 | * [spin](spin/) contains a simple Promela model of the core of TLC and QSC
57 | for the [Spin model checker](https://spinroot.com/spin/whatispin.html).
58 | Although this implementation models TLC and QSC only at a
59 | very high, abstract level, it captures the basic logic enough
60 | to lend confidence to the correctness of the algorithm.
61 |
62 | All of this code is still extremely early and experimental;
63 | use at your own risk.
64 |
65 | [![Build Status](https://travis-ci.com/dedis/tlc.svg?branch=master)](https://travis-ci.com/dedis/tlc)
66 |
67 |
--------------------------------------------------------------------------------
/erlang/model/.gitignore:
--------------------------------------------------------------------------------
1 | *.beam
2 |
--------------------------------------------------------------------------------
/erlang/model/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a minimal implementation of
2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC)
3 | in [Erlang](https://www.erlang.org)
4 | for fail-stop, non-Byzantine environments.
5 | This model implements the QSC, TLCB, and TLCR algorithms detailed in the
6 | [new QSC preprint](https://arxiv.org/abs/2003.02291)
7 | in only 37 lines of code representing the actual algorithm.
8 |
9 | For background information on QSC and TLC,
10 | and other model implementations in several languages, please see the
11 | [top level of this repository](https://github.com/dedis/tlc/).
12 |
--------------------------------------------------------------------------------
/erlang/model/qsc.erl:
--------------------------------------------------------------------------------
1 | -module(qsc).
2 | -export([qsc/1, test/0]).
3 |
4 | % Node configuration is a tuple defined as a record.
5 | % nn: node number from 1..len(pids)
6 | % tr: receive threshold
7 | % tb: broadcast threshold
8 | % ts: spread threshold
9 | % pids: list of process IDs of all nodes
10 | % steps: maximum number of time steps to run, nil to run forever
11 | % choose: function choose(Config, Step) -> Msg to choose application message
12 | % random: function random() -> Value to choose a random priority value
13 | % deliver: function deliver(History) to deliver a committed history
14 | -record(config, {nn, tr, tb, ts, pids, steps, choose, random, deliver}).
15 |
16 | % A history is a record representing the most recent entry in a chain.
17 | -record(hist, {step, nn, msg, pri, pred}).
18 |
19 | % qsc(C) -> (never returns)
20 | % Implements Que Sera Consensus (QSC) atop TLCB and TLCR.
21 | qsc(C) -> qsc(C, 1, #hist{step=0}). % start at step 1 with placeholder pred
22 | qsc(#config{steps=Max}, S0, _) when S0 > Max -> {}; % stop after Max steps
23 | qsc(#config{nn=I, choose=Ch, random=Rv, deliver=D} = C, S0, H0) ->
24 | H1 = #hist{step=S0, nn=I, msg=Ch(C, S0), pri=Rv(), pred=H0},
25 | {S1, R1, B1} = tlcb(C, S0, H1), % Try to broadcast (confirm) proposal
26 | {H2, _} = best(B1), % Choose some best eligible proposal
27 | {S2, R2, B2} = tlcb(C, S1, H2), % Re-broadcast it to reconfirm proposal
28 | {Hn, _} = best(R2), % Choose best eligible for next round
29 | {H3, Unique} = best(R1), % What is the best potential history?
30 | Final = lists:member(Hn, B2) and (Hn == H3) and Unique,
31 | if Final -> D(Hn), qsc(C, S2, Hn); % Deliver history Hn
32 | true -> qsc(C, S2, Hn) % Just proceed to next consensus round
33 | end.
34 |
35 | % best(L) -> {B, U}
36 | % Find and return the best (highest-priority) history B in a nonempty list L,
37 | % and a flag U indicating whether B is uniquely best (highest priority) in L.
38 | best([H]) -> {H, true}; % trivial singleton case
39 | best(L) ->
40 | Compare = fun(#hist{pri=AR}, #hist{pri=BR}) -> AR >= BR end,
41 | [#hist{pri=BR} = B, #hist{pri=NR} | _] = lists:sort(Compare, L),
42 | {B, (BR /= NR)}.
43 |
44 |
45 | % tlcb(C, S, H) -> {S, R, B}
46 | % Implements the TLCB algorithm for full-spread synchronous broadcast.
47 | tlcb(#config{ts=Ts} = C, S0, H) ->
48 | {S1, R1, _} = tlcr(C, S0, H), % Step 1: broadcast history H
49 | {S2, R2, _} = tlcr(C, S1, R1), % Step 2: re-broadcast list we received
50 | R = sets:to_list(sets:union([sets:from_list(L) || L <- [R1 | R2]])),
51 | B = [Hc || Hc <- R, count(R2, Hc) >= Ts],
52 | {S2, R, B}. % New state, receive and broadcast sets
53 |
54 | % count(LL, H) -> N
55 | % Return N the number of lists in list-of-lists LL that include history H.
56 | count(LL, H) -> length([L || L <- LL, lists:member(H, L)]).
57 |
58 |
59 | % tlcr(C, S, M) -> {S, R, nil}
60 | % Implements the TLCR algorithm for receive-threshold synchronous broadcast.
61 | tlcr(#config{pids=Pids} = C, S, M) ->
62 | [P ! {S, M} || P <- Pids], % broadcast next message
63 | tlcr_wait(C, S, []). % wait for receive threshold
64 | tlcr_wait(#config{tr=Tr} = C, S, R) when length(R) < Tr ->
65 | receive {RS, RM} when RS == S -> tlcr_wait(C, S, [RM | R]);
66 | {RS, _} when RS < S -> tlcr_wait(C, S, R) % drop old msgs
67 | end; % when RS > S message stays in the inbox to be received later
68 | tlcr_wait(_, S, R) -> {S+1, R, nil}.
69 |
70 |
71 | % Run a test-case configured for a given number of potentially-failing nodes F,
72 | % then signal Parent process when done.
73 | test_run(F, Steps) ->
74 | % Generate a standard valid configuration from number of failures F.
75 | N = 3*F, Tr = 2*F, Tb = F, Ts = F+1,
76 | io:fwrite("Test N=~p F=~p~n", [N, F]),
77 |
78 | % Function to choose message for node I to propose at TLC time-step S.
79 | Choose = fun(#config{nn=I}, S) -> {msg, S, I} end,
80 |
81 | % Choose a random value to attach to a proposal in time-step S.
82 | % This low-entropy random distribution is intended only for testing,
83 | % so as to ensure a significant rate of ties for best priority.
84 | % Production code should use high-entropy cryptographic randomness for
85 | % maximum efficiency and strength against intelligent DoS attackers.
86 | Random = fun() -> rand:uniform(N) end,
87 |
88 | % Spawn a process to receive and consistency-check committed histories.
89 | Checker = spawn(fun() -> test_checker(#hist{step=0}) end),
90 |
91 | % The nodes will "deliver" histories by sending them back to us.
92 | Deliver = fun(H) -> Checker ! {check, H} end,
93 |
94 | % Launch a process representing each of the N nodes.
95 | Self = self(),
96 | Pids = [spawn(fun() -> test_node(Self) end) || _ <- lists:seq(1, N)],
97 |
98 | % Send each node its complete configuration record to get it started.
99 | C = #config{ tr = Tr, tb = Tb, ts = Ts, pids = Pids, steps = Steps,
100 | choose = Choose, random = Random, deliver = Deliver},
101 | [lists:nth(I, Pids) ! C#config{nn=I} || I <- lists:seq(1, N)],
102 |
103 | % Wait until all nodes run the designated number of time steps.
104 | [test_wait(I) || I <- lists:seq(1, N)],
105 | Checker ! {stop}. % Terminate our checker process
106 |
107 | % Receive a node configuration, run a QSC node simulation with it,
108 | % then send a completion signal to our parent process.
109 | test_node(Parent) ->
110 | receive #config{} = C -> qsc(C), Parent ! {done, C#config.nn} end.
111 |
112 | % Wait to receive a signal that node I is finished.
113 | test_wait(I) -> receive {done, I} -> {} end.
114 |
115 | % test_checker() -> {}
116 | % Receive committed histories from all nodes and consistency-check them
117 | test_checker(Hp) ->
118 | receive {check, H} ->
119 | %io:fwrite("committed ~P~n", [H, 8]),
120 | test_checker(test_check(Hp, H));
121 | {stop} -> {}
122 | end.
123 |
124 | % test_check(A, B) -> H
125 | % Check two histories A and B for consistency, and return the longer one.
126 | test_check(#hist{step=AC,pred=AP} = A, #hist{step=BC} = B) when AC > BC ->
127 | test_check(AP, B), A; % compare shorter prefix of A with B
128 | test_check(#hist{step=AC} = A, #hist{step=BC,pred=BP} = B) when BC > AC ->
129 | test_check(A, BP), B; % compare A with shorter prefix of B
130 | test_check(A, B) when A == B -> A;
131 | test_check(A, B) -> erlang:error({inconsistency, A, B}).
132 |
133 | % Run QSC and TLC through a test suite.
134 | test() ->
135 | [test_run(F, 1000) || F <- [1,2,3,4,5]], % simple test suite
136 | io:fwrite("Tests completed~n").
137 |
138 |
--------------------------------------------------------------------------------
/erlang/model/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | erl -make && erl -noshell -run qsc test -run init stop
3 |
--------------------------------------------------------------------------------
/go/dist/README.md:
--------------------------------------------------------------------------------
1 | This Go package provides a simple but "real" distributed implementation of
2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC)
3 | for fail-stop, non-Byzantine environments.
4 | For background information on QSC and TLC,
5 | and other model implementations in several languages, please see the
6 | [top level of this repository](https://github.com/dedis/tlc/).
7 | For more details on this package see the code and its
8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/dist).
9 |
--------------------------------------------------------------------------------
/go/dist/causal.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | // Broadcast a copy of our current message template to all nodes.
4 | func (n *Node) broadcastCausal(msg *Message) {
5 |
6 | //println(n.self, n.tmpl.Step, "broadcastCausal",
7 | // "mat", len(n.mat))
8 |
9 | // Assign the new message a sequence number
10 | msg.Seq = len(n.seqLog[n.self]) // Assign sequence number
11 | msg.Vec = n.mat[n.self].copy() // Include vector time update
12 | n.logCausal(n.self, msg) // Add msg to our log
13 | //println(n.self, n.tmpl.Step, "broadcastCausal step", msg.Step,
14 | // "typ", msg.Typ, "seq", msg.Seq,
15 | // "vec", fmt.Sprintf("%v", msg.Vec))
16 |
17 | // We always receive our own message first.
18 | n.receiveTLC(msg)
19 |
20 | // Send it to all other peers.
21 | for dest := range n.peer {
22 | if dest != n.self {
23 | n.sendCausal(dest, msg)
24 | }
25 | }
26 | }
27 |
28 | // Log a peer's message, either our own (just sent)
29 | // or another node's (received and ready to be delivered).
30 | func (n *Node) logCausal(peer int, msg *Message) {
31 |
32 | // Update peer's matrix clock and our record of what it saw by msg
33 | for i := range n.peer {
34 | //println(i, "mat", len(n.mat), "vec", len(msg.Vec))
35 | for n.mat[peer][i] < msg.Vec[i] {
36 | n.sawCausal(peer, n.seqLog[i][n.mat[peer][i]])
37 | n.mat[peer][i]++
38 | }
39 | }
40 | n.sawCausal(peer, msg) // msg has been seen by the peer that sent it
41 | n.sawCausal(n.self, msg) // and now we've seen the message too
42 |
43 | n.seqLog[peer] = append(n.seqLog[peer], msg) // log this msg
44 | n.mat[n.self][peer] = len(n.seqLog[peer]) // update our vector time
45 | if len(n.seqLog[peer]) != msg.Seq+1 { // sanity check
46 | panic("out of sync")
47 | }
48 | }
49 |
50 | // Record the fact that a given peer is now known to have seen a given message.
51 | // For Wit messages, record the fact that the proposal was threshold witnessed.
52 | func (n *Node) sawCausal(peer int, msg *Message) {
53 | n.saw[peer].add(msg)
54 | if msg.Typ == Wit {
55 | prop := n.seqLog[msg.From][msg.Prop]
56 | if prop.Typ != Prop {
57 | panic("not a proposal!")
58 | }
59 | n.wit[peer].add(prop)
60 | }
61 | }
62 |
63 | // Transmit a message to a particular node.
64 | func (n *Node) sendCausal(dest int, msg *Message) {
65 | //println(n.self, n.tmpl.Step, "sendCausal to", dest, "typ", msg.Typ,
66 | // "seq", msg.Seq)
67 | n.peer[dest].Send(msg)
68 | }
69 |
70 | // Receive a possibly out-of-order message from the network.
71 | // Enqueue it and actually deliver messages as soon as we can.
72 | func (n *Node) receiveCausal(msg *Message) {
73 |
74 | // Unicast acknowledgments don't get sequence numbers or reordering.
75 | if msg.Typ == Ack {
76 | n.receiveTLC(msg) // Just send it up the stack
77 | return
78 | }
79 |
80 | // Ignore duplicate message deliveries
81 | if msg.Seq < n.mat[n.self][msg.From] {
82 | println(n.self, n.tmpl.Step, "duplicate message from", msg.From,
83 | "seq", msg.Seq)
84 | panic("XXX")
85 | }
86 |
87 | // Enqueue broadcast message for delivery in causal order.
88 | //println(n.self, n.tmpl.Step, "receiveCausal from", msg.From,
89 | // "type", msg.Typ, "seq", msg.Seq,
90 | // "vec", fmt.Sprintf("%v", msg.Vec))
91 | //if len(n.oom[msg.From]) <= msg.Seq - n.mat[n.self][msg.From] - 1000 {
92 | // panic("huge jump")
93 | //}
94 | for len(n.oom[msg.From]) <= msg.Seq-n.mat[n.self][msg.From] {
95 | n.oom[msg.From] = append(n.oom[msg.From], nil)
96 | }
97 | n.oom[msg.From][msg.Seq-n.mat[n.self][msg.From]] = msg
98 |
99 | // Deliver whatever messages we can consistently with causal order.
100 | for progress := true; progress; {
101 | progress = false
102 | for i := range n.peer {
103 | progress = progress || n.deliverCausal(i)
104 | }
105 | }
106 | }
107 |
108 | // Try to deliver out-of-order messages held from a given peer.
109 | // Returns true if we made progress, false if nothing to do for this peer.
110 | func (n *Node) deliverCausal(peer int) bool {
111 | if len(n.oom[peer]) == 0 || n.oom[peer][0] == nil ||
112 | !n.oom[peer][0].Vec.le(n.mat[n.self]) {
113 | return false
114 | }
115 |
116 | // Log the message now that it's in causal order.
117 | //println(n.self, n.tmpl.Step, "enqueueCausal",
118 | // "deliver type", msg.Typ,
119 | // "seq", msg.Seq, "#oom", len(n.oom[i]))
120 | msg := n.oom[peer][0]
121 | n.logCausal(peer, msg)
122 |
123 | // Remove it from this peer's out-of-order message queue.
124 | n.oom[peer] = n.oom[peer][1:]
125 |
126 | // Deliver the message to upper layers.
127 | n.receiveTLC(msg)
128 |
129 | return true // made progress
130 | }
131 |
132 | // Initialize the causality and higher layer state for a node.
133 | func (n *Node) initCausal() {
134 | n.mat = make([]vec, len(n.peer))
135 | n.oom = make([][]*Message, len(n.peer))
136 | n.seqLog = make([][]*Message, len(n.peer))
137 | n.saw = make([]set, len(n.peer))
138 | n.wit = make([]set, len(n.peer))
139 | for i := range n.peer {
140 | n.mat[i] = make(vec, len(n.peer))
141 | n.saw[i] = make(set)
142 | n.wit[i] = make(set)
143 | }
144 |
145 | n.initTLC()
146 | }
147 |
--------------------------------------------------------------------------------
/go/dist/dist_test.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "crypto/ecdsa"
7 | "crypto/elliptic"
8 | crand "crypto/rand"
9 | "crypto/tls"
10 | "crypto/x509"
11 | "encoding/gob"
12 | "encoding/json"
13 | "encoding/pem"
14 | "fmt"
15 | "io"
16 | "math/big"
17 | mrand "math/rand"
18 | "net"
19 | "os"
20 | "os/exec"
21 | "sync"
22 | "testing"
23 | "time"
24 | )
25 |
26 | // MaxSteps to take
27 | var MaxSteps int
28 |
29 | // Maximum random delays to add to message deliveries for testing
30 | var MaxSleep time.Duration
31 |
32 | // Whether to run consensus among multiple separate processes
33 | var MultiProcess = true
34 |
35 | // Whether to use TLS encryption and authentication atop TCP
36 | var UseTLS = true
37 |
38 | // Information about each virtual host passed to child processes via JSON
39 | type testHost struct {
40 | Name string // Virtual host name
41 | Addr string // Host IP address and TCP port
42 | Cert []byte // Host's self-signed x509 certificate
43 | }
44 |
45 | // Configuration information each child goroutine or process needs to launch
46 | type testConfig struct {
47 | Self int // Which participant number we are
48 | Nnodes int // Total number of participants
49 | HostName string // This child's virtual hostname
50 |
51 | MaxSteps int
52 | MaxTicket int32
53 | MaxSleep time.Duration
54 | }
55 |
56 | func TestQSC(t *testing.T) {
57 |
58 | testCase(t, 1, 1, 10000, 0, 0) // Trivial case: 1 of 1 consensus!
59 | testCase(t, 2, 2, 10000, 0, 0) // Another trivial case: 2 of 2
60 |
61 | testCase(t, 2, 3, 1000, 0, 0) // Standard f=1 case
62 | testCase(t, 3, 5, 1000, 0, 0) // Standard f=2 case
63 | testCase(t, 4, 7, 100, 0, 0) // Standard f=3 case
64 | testCase(t, 5, 9, 100, 0, 0) // Standard f=4 case
65 | testCase(t, 11, 21, 20, 0, 0) // Standard f=10 case
66 | //testCase(t, 101, 201, 10, 0, 0) // Standard f=100 case - blows up
67 |
68 | testCase(t, 3, 3, 100, 0, 0) // Larger-than-minimum thresholds
69 | testCase(t, 6, 7, 100, 0, 0)
70 | testCase(t, 9, 10, 100, 0, 0)
71 |
72 | // Test with low-entropy tickets:
73 | // commit success rate will be bad, but still must remain safe!
74 | testCase(t, 2, 3, 10, 1, 0) // Limit case: will never commit
75 | testCase(t, 2, 3, 100, 2, 0) // Extreme low-entropy: rarely commits
76 | testCase(t, 2, 3, 100, 3, 0) // A bit better but still bad...
77 |
78 | // Test with random delays inserted
79 | testCase(t, 2, 3, 100, 0, 1*time.Nanosecond)
80 | testCase(t, 2, 3, 100, 0, 1*time.Microsecond)
81 | testCase(t, 2, 3, 100, 0, 1*time.Millisecond)
82 | testCase(t, 4, 7, 100, 0, 1*time.Microsecond)
83 | testCase(t, 4, 7, 100, 0, 1*time.Millisecond)
84 | }
85 |
86 | func testCase(t *testing.T, threshold, nnodes, maxSteps, maxTicket int,
87 | maxSleep time.Duration) {
88 |
89 | if maxTicket == 0 { // Default to moderate-entropy tickets
90 | maxTicket = 10 * nnodes
91 | }
92 |
93 | desc := fmt.Sprintf("T=%v,N=%v,Steps=%v,Tickets=%v,Sleep=%v",
94 | threshold, nnodes, maxSteps, maxTicket, maxSleep)
95 | t.Run(desc, func(t *testing.T) {
96 |
97 | // Configure and run the test case.
98 | MaxSteps = maxSteps
99 | MaxTicket = int32(maxTicket)
100 | MaxSleep = maxSleep
101 | Threshold = threshold
102 |
103 | testExec(t, threshold, nnodes)
104 | })
105 | }
106 |
107 | func testExec(t *testing.T, threshold, nnodes int) {
108 |
109 | // Create a cancelable context in which to execute helper processes
110 | ctx, cancel := context.WithCancel(context.Background())
111 | defer cancel() // kill child processes
112 |
113 | // Create a public/private keypair and self-signed cert for each node.
114 | conf := make([]testConfig, nnodes) // each node's config information
115 | for i := range conf {
116 | conf[i].Self = i
117 | conf[i].Nnodes = nnodes
118 | conf[i].HostName = fmt.Sprintf("host%v", i)
119 | conf[i].MaxSteps = MaxSteps
120 | conf[i].MaxTicket = MaxTicket
121 | conf[i].MaxSleep = MaxSleep
122 | }
123 |
124 | // Start the per-node child processes,
125 | // and gather network addresses and certificates from each one.
126 | childGroup := &sync.WaitGroup{}
127 | host := make([]testHost, nnodes)
128 | enc := make([]*json.Encoder, nnodes)
129 | dec := make([]*json.Decoder, nnodes)
130 | for i := range host {
131 |
132 | childGroup.Add(1)
133 | childIn, childOut := testExecChild(ctx, &conf[i], t, childGroup)
134 |
135 | // We'll communicate with the child via JSON-encoded stdin/out
136 | enc[i] = json.NewEncoder(childIn)
137 | dec[i] = json.NewDecoder(childOut)
138 |
139 | // Send the child its configuration information
140 | if err := enc[i].Encode(&conf[i]); err != nil {
141 | t.Fatalf("Encode: " + err.Error())
142 | }
143 |
144 | // Get the network address the child is listening on
145 | if err := dec[i].Decode(&host[i]); err != nil {
146 | t.Fatalf("Decode: %v", err.Error())
147 | }
148 | if host[i].Name != conf[i].HostName { // sanity check
149 | panic("hostname mismatch")
150 | }
151 | //println("child", i, "listening on", host[i].Addr)
152 | }
153 |
154 | // Send the array of addresses to all the child processes
155 | for i := range host {
156 | if err := enc[i].Encode(host); err != nil {
157 | t.Fatalf("Encode: " + err.Error())
158 | }
159 | }
160 |
161 | // Wait and collect the consensus histories of each child
162 | hist := make([][]choice, nnodes)
163 | for i := range host {
164 | if err := dec[i].Decode(&hist[i]); err != nil {
165 | t.Fatalf("Decode: %v", err.Error())
166 | }
167 | }
168 |
169 | // Let all the children know they can exit
170 | for i := range host {
171 | if err := enc[i].Encode(struct{}{}); err != nil {
172 | t.Fatalf("Encode: " + err.Error())
173 | }
174 | }
175 |
176 | // Wait for the helper processes to complete
177 | childGroup.Wait()
178 | }
179 |
180 | // Exec a child as a separate process.
181 | func testExecChild(ctx context.Context, conf *testConfig, t *testing.T,
182 | grp *sync.WaitGroup) (io.Writer, io.Reader) {
183 |
184 | if !MultiProcess {
185 | // Run a child as a separate goroutine in the same process.
186 | childInRd, childInWr := io.Pipe()
187 | childOutRd, childOutWr := io.Pipe()
188 | go func() {
189 | testChild(childInRd, childOutWr)
190 | grp.Done()
191 | }()
192 | return childInWr, childOutRd
193 | }
194 |
195 | // Run the child as a separate helper process
196 | cmd := exec.CommandContext(ctx, os.Args[0],
197 | "-test.run=TestHelper")
198 | cmd.Env = append(os.Environ(), "TLC_HELPER=1")
199 |
200 | // Arrange to send standard input to the child via pipe
201 | childIn, err := cmd.StdinPipe()
202 | if err != nil {
203 | t.Fatalf("StdinPipe: %v", err.Error())
204 | }
205 |
206 | // Copy child's standard output to parent via pipe
207 | childOut, err := cmd.StdoutPipe()
208 | if err != nil {
209 | t.Fatalf("StdoutPipe: %v", err.Error())
210 | }
211 |
212 | // Copy child's standard error to parent's standard error
213 | childErr, err := cmd.StderrPipe()
214 | if err != nil {
215 | t.Fatalf("StderrPipe: %v", err.Error())
216 | }
217 | go copyAll(os.Stderr, childErr)
218 |
219 | // Start the command running
220 | if err := cmd.Start(); err != nil {
221 | t.Fatalf("cmd.Start: %v", err.Error())
222 | }
223 |
224 | // Arrange to signal the provided WaitGroup when child terminates
225 | go func() {
226 | if err := cmd.Wait(); err != nil {
227 | t.Fatalf("cmd.Wait: %v", err.Error())
228 | }
229 | grp.Done()
230 | }()
231 |
232 | return childIn, childOut
233 | }
234 |
235 | func TestHelper(t *testing.T) {
236 |
237 | if os.Getenv("TLC_HELPER") == "" {
238 | return // Do nothing except when called as a helper
239 | }
240 |
241 | // Exit with error status if anything goes wrong.
242 | defer os.Exit(1)
243 |
244 | testChild(os.Stdin, os.Stdout)
245 | os.Exit(0)
246 | }
247 |
248 | func copyAll(dst io.Writer, src io.Reader) {
249 | if _, err := io.Copy(dst, src); err != nil {
250 | println("Copy: " + err.Error())
251 | }
252 | }
253 |
254 | func createCert(hostName string) (certPemBytes, privPemBytes []byte) {
255 |
256 | priv, err := ecdsa.GenerateKey(elliptic.P256(), crand.Reader)
257 | if err != nil {
258 | panic("createCert: " + err.Error())
259 | }
260 |
261 | notBefore := time.Now() // valid starting now
262 | notAfter := notBefore.Add(365 * 24 * time.Hour) // valid for a year
263 | tmpl := x509.Certificate{
264 | NotBefore: notBefore,
265 | NotAfter: notAfter,
266 | IsCA: true,
267 | KeyUsage: x509.KeyUsageKeyEncipherment |
268 | x509.KeyUsageDigitalSignature |
269 | x509.KeyUsageCertSign,
270 | ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth,
271 | x509.ExtKeyUsageClientAuth},
272 | BasicConstraintsValid: true,
273 | DNSNames: []string{hostName},
274 | SerialNumber: big.NewInt(1),
275 | }
276 | certb, err := x509.CreateCertificate(crand.Reader, &tmpl, &tmpl,
277 | &priv.PublicKey, priv)
278 | if err != nil {
279 | panic("createCert: " + err.Error())
280 | }
281 |
282 | cert, err := x509.ParseCertificate(certb)
283 | if err != nil {
284 | panic("ParseCertificate: " + err.Error())
285 | }
286 |
287 | if err := cert.VerifyHostname(hostName); err != nil {
288 | panic("VerifyHostname: " + err.Error())
289 | }
290 |
291 | // Sanity-check the certificate just to make sure it actually works.
292 | pool := x509.NewCertPool()
293 | pool.AddCert(cert)
294 | vo := x509.VerifyOptions{DNSName: hostName, Roots: pool}
295 | if _, err := cert.Verify(vo); err != nil {
296 | panic("Verify: " + err.Error())
297 | }
298 | //println("verified for", hostName)
299 |
300 | // PEM-encode our certificate
301 | certPem := bytes.NewBuffer(nil)
302 | if err := pem.Encode(certPem, &pem.Block{Type: "CERTIFICATE",
303 | Bytes: certb}); err != nil {
304 | panic("pem.Encode: " + err.Error())
305 | }
306 |
307 | // PEM-encode our private key
308 | privb, err := x509.MarshalECPrivateKey(priv)
309 | if err != nil {
310 | panic("x509.MarshalECPrivateKey: " + err.Error())
311 | }
312 | privPem := bytes.NewBuffer(nil)
313 | if err := pem.Encode(privPem, &pem.Block{Type: "EC PRIVATE KEY",
314 | Bytes: privb}); err != nil {
315 | panic("pem.Encode: " + err.Error())
316 | }
317 |
318 | return certPem.Bytes(), privPem.Bytes()
319 | }
320 |
321 | func testChild(in io.Reader, out io.Writer) {
322 |
323 | // We'll use JSON over stdin/stdout to coordinate with our parent.
324 | dec := json.NewDecoder(in)
325 | enc := json.NewEncoder(out)
326 |
327 | // Get the child process config information via JSON
328 | conf := testConfig{}
329 | if err := dec.Decode(&conf); err != nil {
330 | panic("Decode: " + err.Error())
331 | }
332 | self := conf.Self
333 | MaxSteps = conf.MaxSteps
334 | MaxTicket = conf.MaxTicket
335 | MaxSleep = conf.MaxSleep
336 |
337 | // Initialize the node appropriately
338 | //println("self", self, "nnodes", conf.Nnodes)
339 | n := &Node{}
340 | n.init(self, make([]peer, conf.Nnodes))
341 | n.mutex.Lock() // keep node's TLC state locked until fully set up
342 |
343 | // Create a TLS/TCP listen socket for this child
344 | tcpl, err := net.Listen("tcp", "")
345 | if err != nil {
346 | panic("Listen: " + err.Error())
347 | }
348 |
349 | // Create an x509 certificate and private key for this child
350 | //println(self, "createCert for", conf.HostName)
351 | certb, privb := createCert(conf.HostName)
352 |
353 | // Create a TLS certificate from it
354 | tlscert, err := tls.X509KeyPair(certb, privb)
355 | if err != nil {
356 | panic("tls.X509KeyPair: " + err.Error())
357 | }
358 |
359 | // Report our network address and certificate to the parent process
360 | myHost := testHost{
361 | Name: conf.HostName,
362 | Addr: tcpl.Addr().String(),
363 | Cert: certb,
364 | }
365 | if err := enc.Encode(myHost); err != nil {
366 | panic("Encode: " + err.Error())
367 | }
368 |
369 | // Get the list of all host names, addresses, and certs from the parent
370 | host := []testHost{}
371 | if err := dec.Decode(&host); err != nil {
372 | panic("Decode: " + err.Error())
373 | }
374 |
375 | // Create a certificate pool containing all nodes' certificates
376 | pool := x509.NewCertPool()
377 | for i := range host {
378 | if !pool.AppendCertsFromPEM(host[i].Cert) {
379 | panic("failed to append cert from " + host[i].Name)
380 | }
381 | }
382 |
383 | //println("hostName", conf.HostName, "pool", len(pool.Subjects()))
384 |
385 | // Listen and accept TCP/TLS connections
386 | donegrp := &sync.WaitGroup{}
387 | go func() {
388 | for {
389 | // Accept a TCP connection
390 | tcpc, err := tcpl.Accept()
391 | if err != nil {
392 | panic("Accept: " + err.Error())
393 | }
394 |
395 | // Launch a goroutine to process it
396 | donegrp.Add(1)
397 | go n.acceptNetwork(tcpc, &tls.Config{
398 | RootCAs: pool,
399 | Certificates: []tls.Certificate{tlscert},
400 | ServerName: conf.HostName,
401 | ClientAuth: tls.RequireAndVerifyClientCert,
402 | ClientCAs: pool,
403 | }, host, donegrp)
404 | }
405 | }()
406 |
407 | // Open TCP and optionally TLS connections to each peer
408 | //println(self, "open TLS connections to", len(host), "peers")
409 | stepgrp := &sync.WaitGroup{}
410 | for i := range host {
411 | // Open an authenticated TLS connection to peer i
412 | peerConf := tls.Config{
413 | RootCAs: pool,
414 | Certificates: []tls.Certificate{tlscert},
415 | ServerName: conf.HostName,
416 | ClientAuth: tls.RequireAndVerifyClientCert,
417 | ClientCAs: pool,
418 | }
419 | peerConf.ServerName = host[i].Name
420 | //println(self, "Dial", host[i].Name, host[i].Addr)
421 | var conn net.Conn
422 | if UseTLS {
423 | conn, err = tls.Dial("tcp", host[i].Addr, &peerConf)
424 | } else {
425 | conn, err = net.Dial("tcp", host[i].Addr)
426 | }
427 | if err != nil {
428 | panic("Dial: " + err.Error())
429 | }
430 |
431 | // Tell the server which client we are.
432 | enc := gob.NewEncoder(conn)
433 | if err := enc.Encode(self); err != nil {
434 | panic("gob.Encode: " + err.Error())
435 | }
436 |
437 | // Set up a peer sender object.
438 | // It signals stepgrp.Done() after enough steps pass.
439 | stepgrp.Add(1)
440 | n.peer[i] = &testPeer{enc, stepgrp, conn}
441 | }
442 | //println(self, "opened TLS connections")
443 |
444 | // Start the consensus test
445 | n.advanceTLC(0)
446 |
447 | // Now we can let the receive goroutines process incoming messages
448 | n.mutex.Unlock()
449 |
450 | // Wait to finish enough consensus rounds
451 | //println(self, "wait for test to complete")
452 | stepgrp.Wait()
453 |
454 | // Report our observed consensus history to the parent
455 | if err := enc.Encode(n.choice); err != nil {
456 | panic("Encode: " + err.Error())
457 | }
458 |
459 | // Finally, wait for our parent to signal when the test is complete.
460 | if err := dec.Decode(&struct{}{}); err != nil {
461 | panic("Decode: " + err.Error())
462 | }
463 |
464 | //println(self, "child finished")
465 | }
466 |
467 | // Accept a new TLS connection on a TCP server socket.
468 | func (n *Node) acceptNetwork(conn net.Conn, tlsConf *tls.Config,
469 | host []testHost, donegrp *sync.WaitGroup) {
470 |
471 | // Enable TLS on the connection and run the handshake.
472 | if UseTLS {
473 | conn = tls.Server(conn, tlsConf)
474 | }
475 | defer func() { conn.Close() }()
476 |
477 | // Receive the client's nodenumber indication
478 | dec := gob.NewDecoder(conn)
479 | var peer int
480 | if err := dec.Decode(&peer); err != nil {
481 | println(n.self, "acceptNetwork gob.Decode: "+err.Error())
482 | return
483 | //panic("acceptNetwork gob.Decode: " + err.Error())
484 | }
485 | if peer < 0 || peer >= len(host) {
486 | println("acceptNetwork: bad peer number")
487 | return
488 | }
489 |
490 | // Authenticate the client with TLS.
491 | // XXX Why doesn't VerifyHostname work to verify a client auth?
492 | // Go TLS bug to report?
493 | //if err := tlsc.VerifyHostname(host[peer].Name); err != nil {
494 | // panic("VerifyHostname: " + err.Error())
495 | //}
496 | if UseTLS {
497 | cs := conn.(*tls.Conn).ConnectionState()
498 | if len(cs.PeerCertificates) < 1 {
499 | println("acceptNetwork: no certificate from client")
500 | return
501 | }
502 | err := cs.PeerCertificates[0].VerifyHostname(host[peer].Name)
503 | if err != nil {
504 | println("VerifyHostname: " + err.Error())
505 | return
506 | }
507 | }
508 |
509 | // Receive and process arriving messages
510 | n.runReceiveNetwork(peer, dec, donegrp)
511 | }
512 |
513 | // Receive messages from a connection and dispatch them into the TLC stack.
514 | func (n *Node) runReceiveNetwork(peer int, dec *gob.Decoder,
515 | grp *sync.WaitGroup) {
516 | for {
517 | // Get next message from this peer
518 | msg := Message{}
519 | err := dec.Decode(&msg)
520 | if err == io.EOF {
521 | break
522 | } else if err != nil {
523 | panic("receiveCausal:" + err.Error())
524 | }
525 | //println(n.self, n.tmpl.Step, "runReceiveNetwork: recv from",
526 | // msg.From, "type", msg.Typ, "seq", msg.Seq,
527 | // "step", msg.Step)
528 |
529 | // Optionally insert random delays on a message basis
530 | time.Sleep(time.Duration(mrand.Int63n(int64(MaxSleep + 1))))
531 |
532 | grp.Add(1)
533 | go n.receiveNetwork(&msg, grp)
534 | }
535 | grp.Done() // signal that we're done
536 | }
537 |
538 | func (n *Node) receiveNetwork(msg *Message, grp *sync.WaitGroup) {
539 |
540 | // Keep the stack single-threaded.
541 | n.mutex.Lock()
542 | defer func() {
543 | n.mutex.Unlock()
544 | grp.Done()
545 | }()
546 |
547 | // Dispatch up to the causal ordering layer
548 | //println(n.self, n.tmpl.Step, "receiveNetwork from", msg.From,
549 | // "type", msg.Typ, "seq", msg.Seq, "vec", len(msg.Vec))
550 | n.receiveCausal(msg)
551 | }
552 |
553 | type testPeer struct {
554 | e *gob.Encoder
555 | w *sync.WaitGroup
556 | c io.Closer
557 | }
558 |
559 | func (tp *testPeer) Send(msg *Message) {
560 | if tp.e != nil {
561 | //println("testPeer.Send seq", msg.Seq, "step", msg.Step,
562 | // "MaxSteps", MaxSteps)
563 | if err := tp.e.Encode(msg); err != nil {
564 | println("Encode:", err.Error())
565 | }
566 | }
567 | if tp.w != nil && MaxSteps > 1 && msg.Step >= MaxSteps {
568 | //println("testPeer.Send done")
569 | tp.w.Done()
570 | tp.w = nil
571 | }
572 | }
573 |
--------------------------------------------------------------------------------
/go/dist/doc.go:
--------------------------------------------------------------------------------
1 | // Package dist implements a minimalistic distributed implementation
2 | // of TLC and QSC for the non-Byzantine (fail-stop) threat model.
3 | // It uses TLS/TCP for communication, gob encoding for serialization, and
4 | // a basic causal ordering protocol based on vector time.
5 | package dist
6 |
--------------------------------------------------------------------------------
/go/dist/node.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | import (
4 | "sync"
5 | )
6 |
7 | // Threshold is the TLC and consensus threshold
8 | var Threshold int
9 |
10 | // MaxTicket is the amount of entropy in lottery tickets
11 | var MaxTicket int32 = 100
12 |
13 | // Type of message
14 | type Type int
15 |
16 | const (
17 | // Prop is a raw unwitnessed proposal
18 | Prop Type = iota
19 | // Ack is an acknowledgment of a proposal
20 | Ack
21 | // Wit is a threshold witness confirmation of proposal
22 | Wit
23 | )
24 |
25 | // Message over the network
26 | type Message struct {
27 | // Network/peering layer
28 |
29 | // From designates the node which originally sent this message
30 | From int
31 |
32 | // Causality layer
33 | // Seq is the Node-local sequence number for vector time
34 | Seq int
35 | // Vec is the vector clock update from the sender node
36 | Vec vec
37 |
38 | // Threshold time (TLC) layer
39 | // Step is the logical time step this message is for
40 | Step int
41 | // typ is the message type
42 | Typ Type
43 | // Prop is the proposal Seq this Ack or Wit is about
44 | Prop int
45 | // Ticket is the genetic fitness ticket for this proposal
46 | Ticket int32
47 | }
48 |
49 | // Node definition
50 | type Node struct {
51 | // Network/peering layer
52 | self int // This node's participant number
53 | peer []peer // How to send messages to each peer
54 | mutex sync.Mutex // Mutex protecting node's protocol stack
55 |
56 | // Causal history layer
57 | mat []vec // Node's current matrix clock
58 | oom [][]*Message // Out-of-order messages not yet delivered
59 | seqLog [][]*Message // Each node's messages received and delivered, by seq
60 | saw []set // Messages each node saw recently
61 | wit []set // Witnessed messages each node saw recently
62 |
63 | // Threshold time (TLC) layer
64 | tmpl Message // Template for messages we send
65 | save int // Earliest step for which we maintain history
66 | acks set // Acknowledgments we've received in this step
67 | wits set // Threshold witnessed messages seen this step
68 | stepLog [][]logEntry // Nodes' messages seen by start of recent steps
69 |
70 | // This node's record of QSC consensus history
71 | choice []choice // Best proposal this node chose each round
72 | }
73 |
74 | type peer interface {
75 | Send(msg *Message)
76 | }
77 |
78 | // Info each node logs about other nodes' views at the start of each time-step
79 | type logEntry struct {
80 | saw set // All nodes' messages the node had seen by then
81 | wit set // Threshold witnessed messages it had seen
82 | }
83 |
84 | // Record of one node's QSC decision in one time-step
85 | type choice struct {
86 | best int // Best proposal this node chose in this round
87 | commit bool // Whether node observed successful commitment
88 | }
89 |
90 | func (n *Node) init(self int, peer []peer) {
91 | n.self = self
92 | n.peer = peer
93 |
94 | n.initCausal()
95 | n.initTLC()
96 | }
97 |
--------------------------------------------------------------------------------
/go/dist/qsc.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | // RoundSteps is three because the witnessed QSC requires three TLC
4 | // time-steps per consensus round.
5 | const RoundSteps = 3
6 |
7 | // The TLC layer upcalls this method on advancing to a new time-step,
8 | // with sets of proposals seen (saw) and threshold witnessed (wit) recently.
9 | func (n *Node) advanceQSC(saw, wit set) {
10 | //println(n.self, n.tmpl.Step, "advanceQSC saw", len(saw),
11 | // "wit", len(wit))
12 |
13 | // Calculate the starting step of the round that's just now completing.
14 | s := n.tmpl.Step - RoundSteps
15 | if s < 0 {
16 | return // Nothing to be done until the first round completes
17 | }
18 |
19 | // Find the best eligible proposal that was broadcast at s+0
20 | // and that is in our view by the end of the round at s+3.
21 | var bestProp *Message
22 | var bestTicket int32
23 | for p := range wit {
24 | if p.Typ != Prop {
25 | panic("wit should contain only proposals")
26 | }
27 | if p.Step == s+0 && p.Ticket >= bestTicket {
28 | bestProp = p
29 | bestTicket = p.Ticket
30 | }
31 | }
32 |
33 | // Determine if we can consider this proposal permanently committed.
34 | spoiled := n.spoiledQSC(s, saw, bestProp, bestTicket)
35 | reconfirmed := n.reconfirmedQSC(s, wit, bestProp)
36 | committed := !spoiled && reconfirmed
37 |
38 | // Record the consensus results for this round (from s to s+3).
39 | n.choice = append(n.choice, choice{bestProp.From, committed})
40 | //println(n.self, n.tmpl.Step, "choice", bestProp.From, "spoiled", spoiled,
41 | // "reconfirmed", reconfirmed, "committed", committed)
42 |
43 | // Don't bother saving history before the start of the next round.
44 | n.save = s + 1
45 | }
46 |
47 | // Return true if there's another proposal competitive with a given candidate.
48 | func (n *Node) spoiledQSC(s int, saw set, prop *Message, ticket int32) bool {
49 | for p := range saw {
50 | if p.Step == s+0 && p.Typ == Prop && p != prop &&
51 | p.Ticket >= ticket {
52 | return true // victory spoiled by competition!
53 | }
54 | }
55 | return false
56 | }
57 |
58 | // Return true if given proposal was doubly confirmed (reconfirmed).
59 | func (n *Node) reconfirmedQSC(s int, wit set, prop *Message) bool {
60 | for p := range wit { // search for a paparazzi witness at s+1
61 | if p.Step == s+1 && n.stepLog[p.From][s+1].wit.has(prop) {
62 | return true
63 | }
64 | }
65 | return false
66 | }
67 |
--------------------------------------------------------------------------------
/go/dist/set.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | // Use a map to represent a set of messages
4 | type set map[*Message]struct{}
5 |
6 | // Test if msg is in set s.
7 | func (s set) has(msg *Message) bool {
8 | _, present := s[msg]
9 | return present
10 | }
11 |
12 | // Add msg to set s.
13 | func (s set) add(msg *Message) {
14 | s[msg] = struct{}{}
15 | }
16 |
17 | // Return a copy of message set s,
18 | // dropping any messages before earliest.
19 | func (s set) copy(earliest int) set {
20 | n := make(set)
21 | for k, v := range s {
22 | if k.Step >= earliest {
23 | n[k] = v
24 | }
25 | }
26 | return n
27 | }
28 |
--------------------------------------------------------------------------------
/go/dist/tlc.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | import (
4 | "math/rand"
5 | )
6 |
7 | // Initialize the TLC layer state in a Node
8 | func (n *Node) initTLC() {
9 | n.tmpl = Message{From: n.self, Step: -1}
10 | n.stepLog = make([][]logEntry, len(n.peer))
11 | }
12 |
13 | // Broadcast a copy of our current message template to all nodes
14 | func (n *Node) broadcastTLC() *Message {
15 |
16 | //println(n.self, n.tmpl.Step, "broadcast", msg, "typ", msg.Typ)
17 | msg := n.tmpl
18 | n.broadcastCausal(&msg)
19 | return &msg
20 | }
21 |
22 | // Unicast an acknowledgment of a given proposal to its sender
23 | func (n *Node) acknowledgeTLC(prop *Message) {
24 |
25 | msg := n.tmpl
26 | msg.Typ = Ack
27 | msg.Prop = prop.Seq
28 | n.sendCausal(prop.From, &msg)
29 | }
30 |
31 | // Advance to a new time step.
32 | func (n *Node) advanceTLC(step int) {
33 | //println(n.self, step, "advanceTLC",
34 | // "saw", len(n.saw[n.self]), "wit", len(n.wit[n.self]))
35 |
36 | // Initialize our message template for new time step
37 | n.tmpl.Step = step // Advance to new time step
38 | n.tmpl.Typ = Prop // Raw unwitnessed proposal message initially
39 | n.tmpl.Ticket = rand.Int31n(MaxTicket) // Choose a ticket
40 |
41 | n.acks = make(set) // No acknowledgments received yet in this step
42 | n.wits = make(set) // No threshold witnessed messages received yet
43 |
44 | // Notify the upper (QSC) layer of the advancement of time,
45 | // and let it fill in its part of the new message to broadcast.
46 | n.advanceQSC(n.saw[n.self], n.wit[n.self])
47 |
48 | prop := n.broadcastTLC() // broadcast our raw proposal
49 | n.tmpl.Prop = prop.Seq // save proposal's sequence number
50 | n.acks.add(prop) // automatically self-acknowledge it
51 | }
52 |
53 | func (n *Node) receiveTLC(msg *Message) {
54 |
55 | // Now process this message according to type.
56 | //println(n.self, n.tmpl.Step, "receivedTLC from", msg.From,
57 | // "step", msg.Step, "typ", msg.Typ)
58 | switch msg.Typ {
59 | case Prop: // A raw unwitnessed proposal broadcast.
60 |
61 | // Record the set of messages this node had seen
62 | // by the time it advanced to this new time-step.
63 | if len(n.stepLog[msg.From]) != msg.Step {
64 | panic("out of sync")
65 | }
66 | n.stepLog[msg.From] = append(n.stepLog[msg.From],
67 | logEntry{n.saw[msg.From], n.wit[msg.From]})
68 |
69 | // Continue from pruned copies in the next time step
70 | n.saw[msg.From] = n.saw[msg.From].copy(n.save)
71 | n.wit[msg.From] = n.wit[msg.From].copy(n.save)
72 |
73 | if msg.Step == n.tmpl.Step {
74 | //println(n.self, n.tmpl.Step, "ack", msg.From)
75 | n.acknowledgeTLC(msg)
76 | }
77 |
78 | case Ack: // An acknowledgment. Collect a threshold of acknowledgments.
79 | if msg.Prop == n.tmpl.Prop { // only if it acks our proposal
80 | n.acks.add(msg)
81 | //println(n.self, n.tmpl.Step, "got ack", len(n.acks))
82 | if n.tmpl.Typ == Prop && len(n.acks) >= Threshold {
83 |
84 | // Broadcast a threshold-witnessed certification
85 | n.tmpl.Typ = Wit
86 | n.broadcastTLC()
87 | }
88 | }
89 |
90 | case Wit: // A threshold-witnessed message. Collect a threshold of them.
91 | prop := n.seqLog[msg.From][msg.Prop]
92 | if prop.Typ != Prop {
93 | panic("doesn't refer to a proposal!")
94 | }
95 | if msg.Step == n.tmpl.Step {
96 |
97 | // Collect a threshold of Wit witnessed messages.
98 | n.wits.add(prop) // witnessed messages in this step
99 | if len(n.wits) >= Threshold {
100 |
101 | // We've met the condition to advance time.
102 | n.advanceTLC(n.tmpl.Step + 1)
103 | }
104 | }
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/go/dist/vec.go:
--------------------------------------------------------------------------------
1 | package dist
2 |
3 | // Vector timestamp
4 | type vec []int
5 |
6 | // Return a copy of this vector
7 | func (v vec) copy() vec {
8 | return append(vec{}, v...)
9 | }
10 |
11 | // Return true if vector timestamp v is causally before or equal to y.
12 | func (v vec) le(y vec) bool {
13 | for i := range v {
14 | if v[i] > y[i] {
15 | return false
16 | }
17 | }
18 | return true
19 | }
20 |
21 | // Set v to the elementwise maximum of vectors x and y.
22 | // Inputs x and/or y can be the same as target v.
23 | func (v vec) max(x, y vec) {
24 | for i := range v {
25 | if x[i] > y[i] {
26 | v[i] = x[i]
27 | } else {
28 | v[i] = y[i]
29 | }
30 | }
31 | }
32 |
33 | //func (v vec) String() {
34 | // fmt.Sprintf("%v", []int(v))
35 | //}
36 |
--------------------------------------------------------------------------------
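A brief illustration (not part of the repository) of how the vec helpers above support the causal-delivery rule used in causal.go; it is written as if it were test code inside the dist package, and the concrete timestamps are invented:

    package dist

    // exampleVec sketches (hypothetically) how vector timestamps gate delivery:
    // a message stamped with mv may be delivered once mv.le(ours) holds, i.e.
    // we have already delivered everything its sender had seen when sending it.
    func exampleVec() {
        ours := vec{3, 1, 4} // messages delivered so far from nodes 0, 1, 2
        mv := vec{2, 1, 4}   // timestamp carried by an incoming message

        deliverable := mv.le(ours) // true: every entry of mv is <= ours

        merged := ours.copy()  // copy so the original is left untouched
        merged.max(merged, mv) // elementwise maximum; still {3, 1, 4} here

        _, _ = deliverable, merged
    }
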
/go/lib/backoff/retry.go:
--------------------------------------------------------------------------------
1 | // Package backoff converts errors into time delays via random exponential backoff.
2 | // It is designed to be extremely simple to use but robust and automatic.
3 | //
4 | package backoff
5 |
6 | import (
7 | "context"
8 | "log"
9 | "math/rand"
10 | "time"
11 | )
12 |
13 | // Retry calls try() repeatedly until it returns without an error,
14 | // with the default exponential backoff configuration.
15 | //
16 | // By default, Retry continues to try forever until it succeeds.
17 | // The caller may pass a cancelable context in the ctx parameter, however,
18 | // in which case Retry gives up calling try when the context is cancelled.
19 | // If the context was already cancelled on the call to Retry,
20 | // then Retry returns ctx.Err() immediately without calling try.
21 | //
22 | func Retry(ctx context.Context, try func() error) error {
23 | return Config{}.Retry(ctx, try)
24 | }
25 |
26 | // Config represents configuration parameters for exponential backoff.
27 | // To use, initialize a Config structure with the desired parameters
28 | // and then call Config.Retry().
29 | //
30 | // Report, if non-nil, is a function called by Retry to report errors
31 | // in an appropriate fashion specific to the application.
32 | // If nil, Retry reports errors via log.Println by default.
33 | // Report may also return a non-nil error to abort the Retry loop if it
34 | // determines that the detected error is permanent and waiting will not help.
35 | //
36 | type Config struct {
37 | Report func(error) error // Function to report errors
38 | MaxWait time.Duration // Maximum backoff wait period
39 |
40 | mayGrow struct{} // Ensure Config remains extensible
41 | }
42 |
43 | func defaultReport(err error) error {
44 | log.Println(err.Error())
45 | return nil
46 | }
47 |
48 | // Retry calls try() repeatedly until it returns without an error,
49 | // using exponential backoff configuration c.
50 | func (c Config) Retry(ctx context.Context, try func() error) error {
51 |
52 | // Make sure we have a valid error reporter
53 | if c.Report == nil {
54 | c.Report = defaultReport
55 | }
56 |
57 | // Return immediately if ctx was already cancelled
58 | if ctx.Err() != nil {
59 | return ctx.Err()
60 | }
61 |
62 | backoff := time.Duration(1) // minimum backoff duration
63 | for {
64 | before := time.Now()
65 | err := try()
66 | if err == nil { // success
67 | return nil
68 | }
69 | elapsed := time.Since(before)
70 |
71 | // Report the error as appropriate
72 | err = c.Report(err)
73 | if err != nil {
74 | return err // abort the retry loop
75 | }
76 |
77 | // Wait for an exponentially-growing random backoff period,
78 | // with the duration of each operation attempt as the minimum
79 | if backoff <= elapsed {
80 | backoff = elapsed
81 | }
82 | backoff += time.Duration(rand.Int63n(int64(backoff)))
83 | if c.MaxWait > 0 && backoff > c.MaxWait {
84 | backoff = c.MaxWait
85 | }
86 |
87 | // Wait for either the backoff timer or a cancel signal.
88 | t := time.NewTimer(backoff)
89 | select {
90 | case <-t.C: // Backoff timer expired
91 | continue
92 |
93 | case <-ctx.Done(): // Our context got cancelled
94 | t.Stop()
95 | return ctx.Err()
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
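A short usage sketch (not part of the repository) of the Retry and Config API defined above; the import path assumes this repository's module layout, and the operation, sentinel error, and report function are hypothetical:

    package main

    import (
        "context"
        "errors"
        "log"
        "time"

        "github.com/dedis/tlc/go/lib/backoff"
    )

    var errPermanent = errors.New("permanent failure") // hypothetical sentinel

    // doRequest stands in for a real operation that may fail transiently.
    func doRequest() error { return nil }

    func main() {
        // Give up after 30 seconds overall, and cap each backoff wait at 5s.
        ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer cancel()

        cfg := backoff.Config{
            MaxWait: 5 * time.Second,
            Report: func(err error) error {
                if errors.Is(err, errPermanent) {
                    return err // abort the retry loop for permanent errors
                }
                log.Println("retrying after error:", err)
                return nil // keep retrying transient errors
            },
        }

        if err := cfg.Retry(ctx, doRequest); err != nil {
            log.Println("gave up:", err)
        }
    }
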
/go/lib/backoff/retry_test.go:
--------------------------------------------------------------------------------
1 | package backoff
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "testing"
8 | "time"
9 | )
10 |
11 | func TestRetry(t *testing.T) {
12 |
13 | n := 0
14 | try := func() error {
15 | n++
16 | if n < 30 {
17 | return fmt.Errorf("test error %d", n)
18 | }
19 | return nil
20 | }
21 | Retry(context.Background(), try)
22 | }
23 |
24 | func TestTimeout(t *testing.T) {
25 |
26 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
27 | try := func() error {
28 | return errors.New("haha, never going to succeed")
29 | }
30 | if err := Retry(ctx, try); err != context.DeadlineExceeded {
31 | t.Errorf("got wrong error from Retry: %v", err.Error())
32 | }
33 |
34 | // Now test with an already-cancelled context
35 | try = func() error {
36 | panic("shouldn't get here!")
37 | }
38 | if err := Retry(ctx, try); err != context.DeadlineExceeded {
39 | t.Errorf("got wrong error from Retry: %v", err.Error())
40 | }
41 |
42 | // for good measure
43 | cancel()
44 | }
45 |
--------------------------------------------------------------------------------
/go/lib/backoff/rfq/doc.go:
--------------------------------------------------------------------------------
1 | // Package rfq implements responsively-fair queueing (RFQ).
2 | // or distributed queueing? RFDQ
3 | //
4 | // If a server has limited resources to serve a potentially-unlimited
5 | // number of clients (especially in a flash crowd or DDoS attack setting),
6 | // we wish to allocate the server's limited resources fairly among clients,
7 | // so that (for example) fast clients cannot indefinitely starve slower ones.
8 | // This definition of fairness implies wait-freedom, i.e., lack of starvation.
9 | // We would like the server to be able to serve an arbitrary number of clients
10 | // in a fair or at least starvation-free way, with only constant server state.
11 | //
12 | // One approach is for the server to organize the clients into a literal queue,
13 | // with each client responsible for remembering who is next in the queue,
14 | // so the space required for next pointers is distributed among the clients.
15 | // This is what queue-based multiprocessor shared memory locking algorithms do.
16 | // But it works only when the clients are perfectly reliable and trustworthy:
17 | // if not, a single crashed client breaks the chain
18 | // and leaves all clients waiting behind it "dangling" and blocked forever
19 | // (at least without introducing timeouts or similar recovery mechanisms).
20 | //
21 | // Another baseline approach is to have all clients run a backoff algorithm
22 | // when they submit a request that the server must reject due to a full queue.
23 | // This approach might be statistically fair and starvation-free
24 | // if all the clients have similar processing speed and connectivity,
25 | // but clients that are much faster than others can starve slow clients.
26 | // This is because if some number of fast clients can saturate the server,
27 | // re-filling the server's queue only a brief moment after space opens up,
28 | // and a slow client's network round-trip time and/or client-side delay
29 | // add up to significantly more than the server's work-item processing time,
30 | // then each time the slow client attempts to retry it will always find
31 | // that the server's queue has already been filled again by the fast clients.
32 | //
33 | // A next-step solution to ensure approximate fairness across all clients
34 | // would be for the server to propagate maximum backoff delays among clients.
35 | // For example, suppose a slow client attempts to submit a request,
36 | // is rejected due to a full server queue, resubmits it t ms later,
37 | // and is again rejected, increasing its backoff timer to 2t ms.
38 | // If the original t ms value was dominated by client-side or network delays,
39 | // and the work-item processing times for fast clients is significantly less,
40 | // then with independent backoff delays the slow client will be starved.
41 | // But if the server notices that the slow client has backed off to 2t ms,
42 | // and in response forces *all* clients to use a maximum backoff of 2t ms
43 | // until the slow client's request has been satisfied,
44 | // then the slow client will no longer be starved
45 | // and allocation of the server's resources will be approximately fair.
46 | //
47 | // This approach fails to be responsive, however: the server's response times
48 | // to fast clients are slowed to those of the slowest client at a given time.
49 | // This approach can also greatly underutilize the server's resources:
50 | // the server may be perfectly able to process many work-items every 2t ms,
51 | // but has slowed itself down to the rate of the slowest client for fairness.
52 | // Pursuing such strong fairness also creates DoS attack vectors,
53 | // since it is trivial for a misbehaved client simply to pretend to be slow.
54 | // In practice we cannot achieve both perfect fairness and responsiveness:
55 | // utilizing the server's full capacity to service fast clients quickly
56 | // inherently means that fast clients obtain more resources than slow clients.
57 | // But we would still like to be "reasonably" fair while also responsive,
58 | // and particularly to ensure that no client, however slow, is starved.
59 | //
60 | // RFQ thus provides "responsively-fair queueing",
61 | // which ensures statistical fairness among clients that see similar delays,
62 | // and similarly ensures fairness among clients in different delay classes.
63 | // ...
64 | //
65 | // Server has a limited internal queue, which it keeps sorted
66 | // oldest-request-first as judged by the server's own clock.
67 | // An externally-queued request can bump an internally-queued request
68 | // if the server has previously outsourced it to the client and forgotten it
69 | // but its approximate service time has arrived and the client resubmitted it.
70 | //
71 | // Tolerating misbehaving (Byzantine) clients:
72 | // use server-side MAC (and optionally encryption) to protect the state
73 | // the server outsources to clients.
74 | //
75 | // Issue: replay attacks, since server doesn't have storage to remember
76 | // which tokens have and haven't been "used" or how many times.
77 | // Full processing might reveal and neutralize the effect of a replay --
78 | // e.g., where a cryptocurrency server finds a UTXO was already spent --
79 | // but full processing might be significantly more costly in resources.
80 | // One simple defense is to have the server keep a record (e.g., hash table)
81 | // of all the tokens that have been processed within some past epoch.
82 | // The server can then detect and trivially discard replays within one epoch,
83 | // thereby rate-limiting the effectiveness of token replays to one per epoch.
84 | // This takes storage linear in the epoch length and server's processing rate,
85 | // but is independent of the number of clients contending to submit requests.
86 | //
87 | // If it is acceptable to impose a maximum round-trip delay on any client,
88 | // denying service to clients that can't resubmit a request within one epoch,
89 | // then the server can presume requests from earlier epochs to be replays
90 | // and reject them unconditionally, thereby eliminating replay attacks.
91 | //
92 | package rfq
93 |
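94 | // The sketch below illustrates the per-epoch replay filter described above.
95 | // It is only an illustrative assumption, not part of this package's design:
96 | // the names replayFilter and accept are hypothetical, and a real server
97 | // would also need to synchronize access and MAC-protect tokens as noted above.
98 | type replayFilter struct {
99 | 	epoch uint64              // epoch the current record covers
100 | 	seen  map[string]struct{} // tokens already processed during this epoch
101 | }
102 | 
103 | // accept reports whether token should be processed during epoch now,
104 | // recording it so that replays within the same epoch are discarded.
105 | // Storage grows only with the number of tokens processed per epoch,
106 | // independent of the number of clients contending.
107 | func (rf *replayFilter) accept(now uint64, token string) bool {
108 | 	if rf.seen == nil || now != rf.epoch {
109 | 		rf.epoch = now // new epoch: forget the old record entirely
110 | 		rf.seen = make(map[string]struct{})
111 | 	}
112 | 	if _, dup := rf.seen[token]; dup {
113 | 		return false // replay within this epoch: discard
114 | 	}
115 | 	rf.seen[token] = struct{}{}
116 | 	return true
117 | }
118 | 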
--------------------------------------------------------------------------------
/go/lib/cas/cas.go:
--------------------------------------------------------------------------------
1 | // Package cas defines a simple compare-and-set (CAS) state interface.
2 | // It defines a generic access interface called Store,
3 | // and a simple in-memory CAS register called Register.
4 | //
5 | package cas
6 |
7 | import (
8 | "context"
9 | "sync"
10 | )
11 |
12 | // Store defines a CAS storage abstraction via a single CompareAndSet method.
13 | //
14 | // CompareAndSet writes a proposed new value to the state,
15 | // provided the state still has the specified old value.
16 | // The compare and conditional write are guaranteed to be atomic,
17 | // ensuring that the caller can avoid undetected state loss due to races.
18 | // CompareAndSet then reads and returns the latest actual state value.
19 | //
20 | // State values are arbitrary opaque Go strings, and may contain binary data.
21 | // While values in principle have no particular length limit, in practice
22 | // Store implementations may expect them to be "reasonably small", i.e.,
23 | // efficient for storing metadata but not necessarily for bulk data storage.
24 | //
25 | // The Store assigns a version number to each value CompareAndSet returns.
26 | // Version numbers must be monotonic but need not be assigned consecutively.
27 | // The version number must increase when the stored value changes,
28 | // and may increase at other times even when the value hasn't changed.
29 | // The caller may simply ignore the version numbers CompareAndSet returns,
30 | // or may use them for consistency-checking and debugging:
31 | // see the Checked wrapper function in the test subpackage for example.
32 | // Version numbers do not impose a burden on Store interface implementations,
33 | // in part because it's easy to adapt a non-versioned underlying CAS interface
34 | // with a simple wrapper that attaches a version number to each proposed value.
35 | //
36 | // CompareAndSet takes a Context parameter so that long-running implementations,
37 | // particularly those accessing remote storage in a distributed system,
38 | // can respond to cancellation requests and timeouts appropriately.
39 | // For robust asynchronous operation, CompareAndSet should return err != nil
40 | // only when its context is cancelled or when it encounters an error
41 | // that it has determined to be permanent and unrecoverable.
42 | // On encountering errors that may be temporary (e.g., due to network outages),
43 | // it is better for the Store to keep trying until success or cancellation,
44 | // using the lib/backoff package for example.
45 | //
46 | type Store interface {
47 | CompareAndSet(ctx context.Context, old, new string) (
48 | version int64, actual string, err error)
49 | }
50 |
51 | // Register implements a simple local-memory CAS register.
52 | // It is thread-safe and ready for use on instantiation.
53 | type Register struct {
54 | mut sync.Mutex // for synchronizing accesses
55 | ver int64 // version number of the latest value
56 | val string // the latest value written
57 | }
58 |
59 | // CompareAndSet implements the Store interface for the CAS register.
60 | func (r *Register) CompareAndSet(ctx context.Context, old, new string) (
61 | version int64, actual string, err error) {
62 |
63 | r.mut.Lock()
64 | defer r.mut.Unlock()
65 |
66 | // Update the value only if the current value is as expected.
67 | if r.val == old {
68 | r.ver, r.val = r.ver+1, new
69 | }
70 |
71 | // Return the actual new value, changed or not.
72 | return r.ver, r.val, nil
73 | }
74 |
75 |
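76 | // The function below is a usage sketch, not part of the package API;
77 | // the name update is hypothetical, and it assumes the register starts
78 | // at the empty string. It shows the typical optimistic-update loop:
79 | // propose a new value derived from the last value observed, and adopt
80 | // whatever actual value the Store reports until our proposal takes effect.
81 | func update(ctx context.Context, st Store, f func(old string) string) (string, error) {
82 | 	old := ""
83 | 	for {
84 | 		new := f(old)
85 | 		_, actual, err := st.CompareAndSet(ctx, old, new)
86 | 		if err != nil {
87 | 			return "", err // cancelled or permanently failed
88 | 		}
89 | 		if actual == new {
90 | 			return actual, nil // our proposal is now the current value
91 | 		}
92 | 		old = actual // another writer won the race: retry from its value
93 | 	}
94 | }
95 | 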
--------------------------------------------------------------------------------
/go/lib/cas/test/cas.go:
--------------------------------------------------------------------------------
1 | // Package test implements shareable code for testing instantiations
2 | // of the cas.Store compare-and-set storage interface.
3 | package test
4 |
5 | import (
6 | "context"
7 | "fmt"
8 | "math/rand"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/dedis/tlc/go/lib/cas"
13 | )
14 |
15 | // History records a history of cas.Store version/value observations,
16 | // typically made across concurrent goroutines or even distributed nodes,
17 | // and checks all these observations for consistency.
18 | //
19 | type History struct {
20 | hist map[int64]string // version-value map defining observed history
21 | 	mut  sync.Mutex       // mutex protecting the observation history
22 | }
23 |
24 | // Observe records an old/new value pair that was observed via a cas.Store,
25 | // checks it for consistency against all prior recorded old/new value pairs,
26 | // and reports any errors via testing context t.
27 | //
28 | func (to *History) Observe(t *testing.T, version int64, value string) {
29 | to.mut.Lock()
30 | defer to.mut.Unlock()
31 |
32 | // Create the version/value map if it doesn't already exist
33 | if to.hist == nil {
34 | to.hist = make(map[int64]string)
35 | }
36 |
37 | // If there is a recorded value for this version, it must be the same.
38 | if old, exist := to.hist[version]; exist && old != value {
39 | t.Errorf("\nInconsistency:\n ver %v\n old %q\n new %q\n",
40 | version, old, value)
41 | }
42 |
43 | // Record the new successor
44 | to.hist[version] = value
45 | }
46 |
47 | // Checked wraps the provided CAS store with a consistency-checker
48 | // that records all requested and observed accesses against history h,
49 | // reporting any inconsistency errors discovered via testing context t.
50 | //
51 | // The wrapper also consistency-checks the caller's accesses to the Store,
52 | // e.g., that the provided old value is indeed the last version retrieved.
53 | // This means that when checking a Store that is shared across goroutines,
54 | // each goroutine must have its own Checked wrapper around that Store.
55 | //
56 | func Checked(t *testing.T, h *History, store cas.Store) cas.Store {
57 | return &checkedStore{t: t, h: h, s: store}
58 | }
59 |
60 | type checkedStore struct {
61 | t *testing.T // Testing context
62 | h *History // History we're using for consistency-checking
63 | s cas.Store // Underlying compare-and-set Store
64 |
65 | lver int64 // Last version number read from the underlying Store
66 | lval string // Last value read from the underlying Store
67 |
68 | rver int64 // Our fudged informational version numbers for testing
69 | }
70 |
71 | func (cs *checkedStore) CompareAndSet(ctx context.Context, old, new string) (
72 | version int64, actual string, err error) {
73 |
74 | // Sanity-check the arguments we're passed
75 | if old != cs.lval {
76 | cs.t.Errorf("CompareAndSet: wrong old value %q != %q",
77 | old, cs.lval)
78 | }
79 | if new == "" {
80 | cs.t.Errorf("CompareAndSet: new value empty")
81 | }
82 | if new == old {
83 | cs.t.Errorf("CompareAndSet: new value identical to old")
84 | }
85 |
86 | // Try to change old to new atomically.
87 | version, actual, err = cs.s.CompareAndSet(ctx, old, new)
88 |
89 | // Sanity-check the Store-assigned version numbers
90 | if version < cs.lver {
91 | cs.t.Errorf("CompareAndSet: Store version number decreased")
92 | }
93 | if version == cs.lver && actual != cs.lval {
94 | cs.t.Errorf("CompareAndSet: Store version failed to increase")
95 | }
96 |
97 | // Record and consistency-check all version/value pairs we observe.
98 | cs.h.Observe(cs.t, version, actual)
99 |
100 | // Produce our own informational version numbers to return
101 | 	// that increase a bit unpredictably for testing purposes.
102 | if version > cs.lver {
103 | cs.rver++
104 | }
105 | cs.rver += rand.Int63n(3)
106 |
107 | // Update our cached record of the underlying Store's last state
108 | cs.lver, cs.lval = version, actual
109 |
110 | // Return the actual new value regardless.
111 | return cs.rver, actual, err
112 | }
113 |
114 | // Stores torture-tests one or more cas.Store interfaces
115 | // that are all supposed to represent the same consistent underlying state.
116 | // The test is driven by nthreads goroutines per Store interface,
117 | // each of which performs naccesses CAS operations on its interface.
118 | //
119 | func Stores(t *testing.T, nthreads, naccesses int, store ...cas.Store) {
120 |
121 | bg := context.Background()
122 | wg := sync.WaitGroup{}
123 | h := &History{}
124 |
125 | tester := func(i, j int) {
126 | cs := Checked(t, h, store[i])
127 | old, err := "", error(nil)
128 | for k := 0; k < naccesses; k++ {
129 | new := fmt.Sprintf("store %v thread %v access %v",
130 | i, j, k)
131 | //println("tester", i, j, "access", k)
132 | _, old, err = cs.CompareAndSet(bg, old, new)
133 | if err != nil {
134 | t.Error("CompareAndSet: " + err.Error())
135 | }
136 | }
137 | //println("tester", i, j, "done")
138 | wg.Done()
139 | }
140 |
141 | // Launch a set of goroutines for each Store interface.
142 | // To maximize cross-store concurrency,
143 | // launch the first thread per store, then the second per store, etc.
144 | for j := 0; j < nthreads; j++ {
145 | for i := range store {
146 | wg.Add(1)
147 | go tester(i, j)
148 | }
149 | }
150 |
151 | // Wait for all tester goroutines to complete
152 | wg.Wait()
153 | }
154 |
--------------------------------------------------------------------------------
/go/lib/cas/test/cas_test.go:
--------------------------------------------------------------------------------
1 | package test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/dedis/tlc/go/lib/cas"
7 | )
8 |
9 | // Torture-test the trivial in-memory Register CAS Store implementation.
10 | func TestRegister(t *testing.T) {
11 | Stores(t, 100, 100000, &cas.Register{})
12 | }
13 |
--------------------------------------------------------------------------------
/go/lib/doc.go:
--------------------------------------------------------------------------------
1 | // Sub-packages of this package contain common library functionality
2 | // useful in implementations of threshold logical clocks and consensus.
3 | package lib
4 |
--------------------------------------------------------------------------------
/go/lib/fs/atomic/atomic.go:
--------------------------------------------------------------------------------
1 | // Package atomic supports writing files atomically
2 | // while ensuring "at-most-once" write semantics.
3 | package atomic
4 |
5 | import (
6 | "errors"
7 | "fmt"
8 | "io/ioutil"
9 | "os"
10 | "path/filepath"
11 | )
12 |
13 | // WriteFileOnce attempts to write data to filename atomically, only once,
14 | // failing with ErrExist if someone else already wrote a file at filename.
15 | //
16 | // Ensures that no one ever sees a zero-length or incomplete file
17 | // at the target filename, by writing data to a temporary file first,
18 | // synchronizing it to stable storage, then atomically linking it into place.
19 | //
20 | // This code solves a different problem from, but is partly inspired by:
21 | // https://github.com/google/renameio
22 | // https://github.com/natefinch/atomic
23 | //
24 | func WriteFileOnce(filename string, data []byte, perm os.FileMode) error {
25 |
26 | // Create a temporary file in the target directory,
27 | // mainly to ensure that it's on the same volume for hard linking.
28 | dir, name := filepath.Split(filename)
29 | pattern := fmt.Sprintf("%s-*.tmp", name)
30 | tmpfile, err := ioutil.TempFile(dir, pattern)
31 | if err != nil {
32 | return err
33 | }
34 |
35 | // Make sure it gets closed and removed regardless of outcome.
36 | tmpname := tmpfile.Name()
37 | defer func() {
38 | tmpfile.Close()
39 | os.Remove(tmpname)
40 | }()
41 |
42 | // Write the data to the temporary file.
43 | n, err := tmpfile.Write(data)
44 | if err != nil {
45 | return err
46 | }
47 | if n < len(data) {
48 | return errors.New("short write")
49 | }
50 |
51 | // Set the correct file permissions
52 | if err := tmpfile.Chmod(perm); err != nil {
53 | return err
54 | }
55 |
56 | // Force the newly-written data to stable storage.
57 | 	// For background on this see comments for CloseAtomicallyReplace
58 | // at https://github.com/google/renameio/blob/master/tempfile.go
59 | //
60 | if err := tmpfile.Sync(); err != nil {
61 | return err
62 | }
63 |
64 | if err := tmpfile.Close(); err != nil {
65 | return err
66 | }
67 |
68 | // Atomically hard-link the temporary file into the target filename.
69 | // Unlike os.Rename, this fails if target filename already exists.
70 | if err := os.Link(tmpname, filename); err != nil {
71 | return err
72 | }
73 |
74 | return nil
75 | }
76 |
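77 | // writeIfAbsent is a usage sketch, not part of the package API
78 | // (the name and the 0644 permission are illustrative assumptions):
79 | // it writes a file once and treats a lost creation race as success,
80 | // since in that case a complete file already exists at the target name.
81 | func writeIfAbsent(filename string, data []byte) error {
82 | 	err := WriteFileOnce(filename, data, 0644)
83 | 	if err != nil && os.IsExist(err) {
84 | 		return nil // another writer already created the file
85 | 	}
86 | 	return err
87 | }
88 | 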
--------------------------------------------------------------------------------
/go/lib/fs/atomic/atomic_test.go:
--------------------------------------------------------------------------------
1 | package atomic
2 |
3 | import (
4 | "io/ioutil"
5 | "math/rand"
6 | "os"
7 | "sync"
8 | "testing"
9 | "time"
10 | )
11 |
12 | func TestWriteFileOnce(t *testing.T) {
13 |
14 | filename := "testfile.tmp"
15 | var wg sync.WaitGroup
16 | writer := func(i int) {
17 |
18 | // Sleep a small random duration to jitter the test
19 | time.Sleep(time.Duration(rand.Int63n(int64(time.Microsecond))))
20 | //println("thread",i,"writing")
21 |
22 | // Try to write the file
23 | b := make([]byte, i) // create i-length file filled with i's
24 | for j := range b {
25 | b[j] = byte(i)
26 | }
27 | err := WriteFileOnce(filename, b, 0644)
28 | if err != nil && !os.IsExist(err) {
29 | t.Error("WriteFileOnce:", err)
30 | }
31 |
32 | // Now try to read the file that got written
33 | b, err = ioutil.ReadFile(filename)
34 | if err != nil {
35 | t.Error("ReadFile", err)
36 | }
37 |
38 | // Check that what we read back is valid
39 | //println("thread",i,"read",len(b))
40 | i = len(b)
41 | if i == 0 {
42 | t.Error("zero-length file shouldn't be possible")
43 | }
44 | for j := range b {
45 | if b[j] != byte(i) {
46 | t.Error("read file has wrong byte at", j)
47 | }
48 | }
49 |
50 | wg.Done()
51 | }
52 |
53 | // Test with increasing numbers of threads
54 | for n := 1; n <= 128; n *= 2 {
55 |
56 | //println("\ntesting", n, "threads")
57 | for i := 1; i <= n; i++ {
58 | wg.Add(1)
59 | go writer(i)
60 | }
61 | wg.Wait()
62 |
63 | os.Remove(filename)
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/go/lib/fs/casdir/state.go:
--------------------------------------------------------------------------------
1 | // Package casdir implements a versioned compare-and-set (CAS) state abstraction
2 | // in a directory on a standard POSIX-compatible file system.
3 | //
4 | // See the tlc/go/lib/cas package for general information
5 | // on this CAS state abstraction.
6 | //
7 | // This implementation is just a simple wrapper around the verst package,
8 | // which provides a slightly-more-general versioned state abstraction.
9 | // To implement CAS, in essence, we simply expire old versions immediately
10 | // as soon as any new version is written.
11 | //
12 | package casdir
13 |
14 | import (
15 | "context"
16 |
17 | "github.com/dedis/tlc/go/lib/fs/verst"
18 | )
19 |
20 | // Store implements the compare-and-set state abstraction
21 | // generically defined by the cas.Store interface,
22 | // holding the underlying state in a POSIX directory.
23 | //
24 | // The underlying state directory may be shared locally or remotely
25 | // (e.g., via NFS-mounted file systems),
26 | // provided that file system accesses ensure file-level POSIX atomicity.
27 | //
28 | // Each Store instance is intended for use by only one goroutine at a time,
29 | // so the client must synchronize shared uses across multiple goroutines.
30 | //
31 | type Store struct {
32 | vs verst.State // underlying versioned state
33 | lver int64 // last version we've read
34 | lval string // application value associated with lver
35 | }
36 |
37 | // Init sets Store to refer to a CAS register at a given file system path.
38 | // If create is true, creates the designated directory if it doesn't exist.
39 | // If excl is true, fails if the designated directory already exists.
40 | //
41 | func (st *Store) Init(path string, create, excl bool) error {
42 | return st.vs.Init(path, create, excl)
43 | }
44 |
45 | // CompareAndSet writes value new provided the state still holds value old,
46 | // then reads and returns the actual current state version and value.
47 | //
48 | func (st *Store) CompareAndSet(ctx context.Context, old, new string) (
49 | version int64, actual string, err error) {
50 |
51 | if old != st.lval {
52 | panic("CompareAndSet: wrong old value")
53 | }
54 |
55 | // Try to write the new version to the underlying versioned store -
56 | // but don't fret if someone else wrote it or if it has expired.
57 | ver := st.lver + 1
58 | err = st.vs.WriteVersion(ver, new)
59 | if err != nil && !verst.IsExist(err) && !verst.IsNotExist(err) {
60 | return 0, "", err
61 | }
62 |
63 | // Now read back whatever value was successfully written.
64 | val, err := st.vs.ReadVersion(ver)
65 | if err != nil && verst.IsNotExist(err) {
66 |
67 | // The requested version has probably been aged out,
68 | // so catch up to the most recent committed value.
69 | ver, val, err = st.vs.ReadLatest()
70 | }
71 | if err != nil {
72 | return 0, "", err
73 | }
74 |
75 | // Expire all versions before this latest one
76 | st.vs.Expire(ver)
77 |
78 | // Return the actual version and value that we read
79 | st.lver, st.lval = ver, val
80 | return ver, val, err
81 | }
82 |
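83 | // exampleOpenAndSet is a usage sketch, not part of the package API
84 | // (the function name is an illustrative assumption): it opens or creates
85 | // a CAS state directory and makes one compare-and-set attempt against it,
86 | // returning whatever value the store actually holds afterwards.
87 | func exampleOpenAndSet(ctx context.Context, path, new string) (string, error) {
88 | 	st := &Store{}
89 | 	if err := st.Init(path, true, false); err != nil {
90 | 		return "", err
91 | 	}
92 | 	_, actual, err := st.CompareAndSet(ctx, "", new)
93 | 	return actual, err
94 | }
95 | 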
--------------------------------------------------------------------------------
/go/lib/fs/verst/state.go:
--------------------------------------------------------------------------------
1 | // Package verst implements a simple persistent versioned state abstraction
2 | // in a directory on a standard POSIX-compatible file system.
3 | //
4 | // The abstraction that verst presents is essentially a key/value store,
5 | // in which the keys are sequentially-increasing version numbers,
6 | // and the values are opaque byte strings (which we represent as Go strings).
7 | // The main operations verst provides are
8 | // reading a particular (or the latest) version,
9 | // and writing a new version as a successor to the latest version.
10 | // The implementation ensures that new version writes are atomic:
11 | // clients will never read partially-written values, for example.
12 | // If several clients attempt to write the same new version concurrently,
13 | // one will succeed while all the others will fail,
14 | // and potentially need to retry with respect to the new latest version.
15 | //
16 | // The package is designed assuming that values are small,
17 | // e.g., metadata rather than bulk data, appropriate for Go strings
18 | // and for reading and writing all at once as atomic units.
19 | // Bulk data should be handled by other means.
20 | //
21 | // The verst package uses simple atomic POSIX file system operations,
22 | // with no locking, to manage concurrency in the underlying file system.
23 | // It supports garbage-collection of old state versions
24 | // by using atomic POSIX directory-manipulation operations.
25 | // Barring bugs, it "should" not be possible to violate
26 | // the guaranteed atomicity properties or corrupt the state store
27 | // regardless of how many clients may be competing to access it
28 | // or with what access patterns or delays.
29 | // This atomicity is necessarily only as good as the underlying file system's
30 | // guarantee of atomicity and consistency of the underlying operations:
31 | // e.g., if the underlying file system can leave a rename operation
32 | // half-completed after a badly-timed crash, the state could be corrupted.
33 | //
34 | // The design of verst guarantees progress, but not fairness:
35 | // that is, by standard definitions it is lock-free but not wait-free
36 | // (https://en.wikipedia.org/wiki/Non-blocking_algorithm).
37 | // Regardless of the amount of contention to write a new version, for example,
38 | // verst guarantees that at least one client will be able to make progress.
39 | // It makes no guarantee of a "fair" rotation among clients, however,
40 | // or that some particularly slow or otherwise unlucky client will not starve.
41 | //
42 | // While this package currently lives in the tlc repository,
43 | // it is not particularly specific to TLC and depends on nothing else in it,
44 | // and hence might eventually be moved to a more generic home if appropriate.
45 | //
46 | // XXX describe the techniques in a bit more detail.
47 | //
48 | package verst
49 |
50 | import (
51 | "fmt"
52 | "io/ioutil"
53 | "os"
54 | "path/filepath"
55 | // "errors"
56 |
57 | "github.com/bford/cofo/cbe"
58 | "github.com/dedis/tlc/go/lib/fs/atomic"
59 | )
60 |
61 | //const versPerGen = 100 // Number of versions between generation subdirectories
62 | const versPerGen = 10 // Number of versions between generation subdirectories
63 |
64 | const genFormat = "gen-%d" // Format for generation directory names
65 | const verFormat = "ver-%d" // Format for register version file names
66 |
67 | // State holds cached state for a single verst versioned register.
68 | type State struct {
69 | path string // Base pathname of directory containing register state
70 | genVer int64 // Version number of highest generation subdirectory
71 | genPath string // Pathname to generation subdirectory
72 | ver int64 // Highest register version known to exist already
73 | val string // Cached register value for highest known version
74 | expVer int64 // Version number before which state is expired
75 | }
76 |
77 | // Initialize State to refer to a verst register at a given file system path.
78 | // If create is true, create the designated directory if it doesn't exist.
79 | // If excl is true, fail if the designated directory already exists.
80 | func (st *State) Init(path string, create, excl bool) error {
81 | *st = State{path: path} // Set path and clear cached state
82 |
83 | // First check if the path already exists and is a directory.
84 | stat, err := os.Stat(path)
85 | switch {
86 | case err == nil && !stat.IsDir():
87 | return os.ErrExist // already exists, but not a directory
88 |
89 | case err == nil && !excl:
90 | return st.refresh() // exists: load our cache from it
91 |
92 | case err != nil && (!IsNotExist(err) || !create):
93 | return err // didn't exist and we can't create it
94 | }
95 |
96 | // Create and initialize the version state directory,
97 | // initially with a temporary name for atomicity.
98 | dir, name := filepath.Split(path)
99 | if dir == "" {
100 | dir = "." // Ensure dir is nonempty
101 | }
102 | tmpPath, err := ioutil.TempDir(dir, name+"-*.tmp")
103 | if err != nil {
104 | return err
105 | }
106 | defer func() { // Clean up on return if we can't move it into place
107 | os.RemoveAll(tmpPath)
108 | }()
109 |
110 | // Create an initial generation directory for state version 0
111 | genPath := filepath.Join(tmpPath, fmt.Sprintf(genFormat, 0))
112 | err = os.Mkdir(genPath, 0777)
113 | if err != nil {
114 | return err
115 | }
116 |
117 | // Create an initial state version 0 with the empty string as its value
118 | err = writeVerFile(genPath, fmt.Sprintf(verFormat, 0), "", "")
119 | if err != nil {
120 | return err
121 | }
122 |
123 | // Atomically move the temporary version state directory into place.
124 | err = os.Rename(tmpPath, path)
125 | if err != nil && (excl || !IsExist(err)) {
126 | return err
127 | }
128 |
129 | // Finally, load our cache from the state directory.
130 | return st.refresh()
131 | }
132 |
133 | // Refresh our cached state in an attempt to "catch up" to the
134 | // latest register version on the file system.
135 | // Of course the file system may be a constantly-moving target
136 | // so the refreshed state could be stale again immediately on return.
137 | func (st *State) refresh() error {
138 |
139 | // First find the highest-numbered state generation subdirectory
140 | genver, genname, _, err := scan(st.path, genFormat, 0)
141 | if err != nil {
142 | return err
143 | }
144 |
145 | // Then find the highest-numbered register version in that subdirectory
146 | genpath := filepath.Join(st.path, genname)
147 | regver, regname, _, err := scan(genpath, verFormat, 0)
148 | if err != nil {
149 | return err
150 | }
151 |
152 | // Read that highest register version file
153 | val, _, err := readVerFile(genpath, regname)
154 | if err != nil {
155 | return err
156 | }
157 |
158 | st.genVer = genver
159 | st.genPath = genpath
160 |
161 | st.ver = regver
162 | st.val = val
163 |
164 | return nil
165 | }
166 |
167 | // Scan a directory for highest-numbered file or subdirectory matching format.
168 | // If upTo > 0, returns the highest-numbered version no higher than upTo.
169 | func scan(path, format string, upTo int64) (
170 | maxver int64, maxname string, names []string, err error) {
171 |
172 | // Scan the verst directory for the highest-numbered subdirectory.
173 | dir, err := os.Open(path)
174 | if err != nil {
175 | return 0, "", nil, err
176 | }
177 | info, err := dir.Readdir(0)
178 | if err != nil {
179 | return 0, "", nil, err
180 | }
181 |
182 | // Find the highest-numbered generation subdirectory
183 | maxver = -1
184 | for i := range info {
185 | name := info[i].Name()
186 |
187 | // Scan the version number embedded in the name, if any,
188 | // and confirm that the filename exactly matches the format.
189 | var ver int64
190 | n, err := fmt.Sscanf(name, format, &ver)
191 | if n < 1 || err != nil || name != fmt.Sprintf(format, ver) {
192 | continue
193 | }
194 |
195 | // Find the highest extant version number
196 | // (no greater than upTo, if upTo is nonzero)
197 | if ver > maxver && (upTo == 0 || ver <= upTo) {
198 | maxver, maxname = ver, name
199 | }
200 |
201 | // If upTo is nonzero, collect all the matching names.
202 | if upTo > 0 && ver <= upTo {
203 | names = append(names, name)
204 | }
205 | }
206 | if maxver < 0 { // No highest version!? oops
207 | return 0, "", nil, os.ErrNotExist
208 | }
209 | return
210 | }
211 |
212 | // Read and parse the register version file at regpath.
213 | func readVerFile(genPath, verName string) (val, nextGen string, err error) {
214 |
215 | regPath := filepath.Join(genPath, verName)
216 | b, err := ioutil.ReadFile(regPath)
217 | if err != nil {
218 | return "", "", err
219 | }
220 |
221 | // The encoded value is always first and not optional
222 | rb, b, err := cbe.Decode(b)
223 | if err != nil {
224 | println("corrupt verst version file " + regPath)
225 | return "", "", err
226 | }
227 |
228 | // The encoded next-generation directory name is optional
229 | nxg, b, err := cbe.Decode(b)
230 | // (ignore decoding errors)
231 |
232 | return string(rb), string(nxg), nil
233 | }
234 |
235 | // Read the latest version of the stored state,
236 | // returning both the highest version number (key) and associated value.
237 | // Of course a new version might be written at any time,
238 | // so the caller must assume this information could become stale immediately.
239 | func (st *State) ReadLatest() (ver int64, val string, err error) {
240 |
241 | if err := st.refresh(); err != nil {
242 | return 0, "", err
243 | }
244 | return st.ver, st.val, nil
245 | }
246 |
247 | // Read a specific version of the stored state,
248 | // returning the associated value if possible.
249 | // Returns ErrNotExist if the specified version does not exist,
250 | // either because it has never been written or because it has been expired.
251 | func (st *State) ReadVersion(ver int64) (val string, err error) {
252 |
253 | // In the common case of reading back the last-written version,
254 | // just return its value from our cache.
255 | if ver == st.ver {
256 | return st.val, nil
257 | }
258 |
259 | // Find and read the appropriate version file
260 | val, err = st.readUncached(ver)
261 | if err != nil {
262 | return "", err
263 | }
264 |
265 | // Update our cached state as appropriate.
266 | if ver > st.ver {
267 | st.ver = ver
268 | st.val = val
269 | }
270 |
271 | return val, nil
272 | }
273 |
274 | func (st *State) readUncached(ver int64) (val string, err error) {
275 |
276 | // Optimize for sequential reads of the "next" version
277 | verName := fmt.Sprintf(verFormat, ver)
278 | if ver >= st.genVer {
279 | val, _, err := readVerFile(st.genPath, verName)
280 | if err == nil {
281 | return val, nil // success
282 | }
283 | if !IsNotExist(err) {
284 | return "", err // error other than non-existent
285 | }
286 | }
287 |
288 | // Fallback: scan for the generation containing requested version.
289 | //println("readUncached: fallback at", ver)
290 | genVer, genName, _, err := scan(st.path, genFormat, ver)
291 | if err != nil {
292 | return "", err
293 | }
294 | //println("readUncached: found", ver, "in gen", genVer)
295 |
296 | // The requested version should be in directory genName if it exists.
297 | genPath := filepath.Join(st.path, genName)
298 | val, _, err = readVerFile(genPath, verName)
299 | if err != nil {
300 | return "", err
301 | }
302 |
303 | // Update our cached generation state
304 | if ver >= st.ver {
305 | 		//println("moving to generation", genVer, "at ver", ver)
306 | st.genVer = genVer
307 | st.genPath = genPath
308 | }
309 |
310 | return val, err
311 | }
312 |
313 | // Write version ver with associated value val if ver is not yet written.
314 | // The caller may skip version numbers, e.g., to catch up a delayed store,
315 | // but must never try to (re-)write a version at or below the last one written.
316 | //
317 | func (st *State) WriteVersion(ver int64, val string) (err error) {
318 |
319 | if ver <= st.ver {
320 | return ErrExist
321 | }
322 | verName := fmt.Sprintf(verFormat, ver)
323 |
324 | // Should this register version start a new generation?
325 | tmpGenName := ""
326 | if ver%versPerGen == 0 {
327 |
328 | // Prepare the new generation in a temporary directory first
329 | pattern := fmt.Sprintf(genFormat+"-*.tmp", ver)
330 | tmpPath, err := ioutil.TempDir(st.path, pattern)
331 | if err != nil {
332 | return err
333 | }
334 | defer func() {
335 | os.RemoveAll(tmpPath)
336 | }()
337 | tmpGenName = filepath.Base(tmpPath)
338 |
339 | // Write the new register version in the new directory (too)
340 | err = writeVerFile(tmpPath, verName, val, tmpGenName)
341 | if err != nil {
342 | return err
343 | }
344 | }
345 |
346 | // Write version into the (old) generation directory
347 | err = writeVerFile(st.genPath, verName, val, tmpGenName)
348 | if err != nil && !IsExist(err) {
349 | return err
350 | }
351 |
352 | // Read back whatever register version file actually got written,
353 | // which might be from someone else's write that won over ours.
354 | val, tmpGenName, err = readVerFile(st.genPath, verName)
355 | if err != nil {
356 | return err
357 | }
358 |
359 | // If the (actual) new version indicates a new generation directory,
360 | // try to move the temporary directory into its place.
361 | // It's harmless if multiple writers attempt this redundantly:
362 | // it fails if either the old temporary directory no longer exists
363 | // or if a directory with the new name already exists.
364 | if tmpGenName != "" {
365 | oldGenPath := filepath.Join(st.path, tmpGenName)
366 | newGenPath := filepath.Join(st.path,
367 | fmt.Sprintf(genFormat, ver))
368 | err := os.Rename(oldGenPath, newGenPath)
369 | if err != nil && !IsExist(err) && !IsNotExist(err) {
370 | return err
371 | }
372 |
373 | // It's a good time to expire old generations when feasible
374 | st.expireOld()
375 |
376 | // Update our cached generation state
377 | st.genVer = ver
378 | st.genPath = newGenPath
379 | }
380 |
381 | // Update our cached version state
382 | st.ver = ver
383 | st.val = val
384 | return nil
385 | }
386 |
387 | func writeVerFile(genPath, verName, val, nextGen string) error {
388 |
389 | // Encode the new register version file
390 | b := cbe.Encode(nil, []byte(val))
391 | b = cbe.Encode(b, []byte(nextGen))
392 |
393 | // Write it atomically
394 | verPath := filepath.Join(genPath, verName)
395 | if err := atomic.WriteFileOnce(verPath, b, 0644); err != nil {
396 | return err
397 | }
398 |
399 | return nil
400 | }
401 |
402 | // Expire indicates that state versions earlier than before may be deleted.
403 | // It does not necessarily delete these older versions immediately, however.
404 | // Attempts either to read or to write expired versions will fail.
405 | //
406 | func (st *State) Expire(before int64) {
407 | if st.expVer < before {
408 | st.expVer = before
409 | }
410 | }
411 |
412 | // Actually try to delete expired versions.
413 | // We do this only about once per generation for efficiency.
414 | func (st *State) expireOld() {
415 |
416 | // Find all existing generation directories up to version 'before'
417 | maxVer, maxName, names, err := scan(st.path, genFormat, st.expVer)
418 | if err != nil || len(names) == 0 {
419 | return // ignore errors, e.g., no expired generations
420 | }
421 | if maxVer < 0 || maxVer > st.expVer {
422 | println("expireOld oops", len(names), maxVer, st.expVer)
423 | panic("shouldn't happen")
424 | }
425 |
426 | // Delete all generation directories before maxVer,
427 | // since those can only contain versions strictly before maxVer.
428 | for _, genName := range names {
429 | if genName != maxName {
430 | genPath := filepath.Join(st.path, genName)
431 | atomicRemoveAll(genPath)
432 | }
433 | }
434 | }
435 |
436 | // Atomically remove the directory at path,
437 | // ensuring that no one sees inconsistent states within it,
438 | // by renaming it before starting to delete its contents.
439 | func atomicRemoveAll(path string) error {
440 |
441 | tmpPath := fmt.Sprintf("%s.old", path)
442 | if err := os.Rename(path, tmpPath); err != nil {
443 | return err
444 | }
445 |
446 | return os.RemoveAll(tmpPath)
447 | }
448 |
449 | // IsExist returns true when an error from State.WriteVersion indicates
450 | // that the version the caller asked to write already exists.
451 | func IsExist(err error) bool {
452 | return os.IsExist(err)
453 | }
454 |
455 | // IsNotExist returns true when an error from State.ReadVersion indicates
456 | // that the version the caller asked to read does not exist.
457 | func IsNotExist(err error) bool {
458 | return os.IsNotExist(err)
459 | }
460 |
461 | var ErrExist = os.ErrExist       // the version to be written already exists
462 | var ErrNotExist = os.ErrNotExist // the version to be read does not exist
463 |
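464 | // exampleAdvance is a usage sketch, not part of the package API
465 | // (the function name is an illustrative assumption). It shows the
466 | // intended write/read cycle: catch up to the latest version, try to
467 | // write its successor, and read back whichever value actually won
468 | // if another client managed to write that version first.
469 | func exampleAdvance(st *State, val string) (ver int64, actual string, err error) {
470 | 	ver, _, err = st.ReadLatest()
471 | 	if err != nil {
472 | 		return 0, "", err
473 | 	}
474 | 	ver++ // propose the next version
475 | 	if err = st.WriteVersion(ver, val); err != nil && !IsExist(err) {
476 | 		return 0, "", err
477 | 	}
478 | 	if actual, err = st.ReadVersion(ver); err != nil {
479 | 		return 0, "", err // e.g., the version has already been expired
480 | 	}
481 | 	return ver, actual, nil
482 | }
483 | 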
--------------------------------------------------------------------------------
/go/model/README.md:
--------------------------------------------------------------------------------
1 | This Go package provides a minimal implementation of
2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC)
3 | for fail-stop, non-Byzantine environments.
4 | For background information on QSC and TLC,
5 | and other model implementations in several languages, please see the
6 | [top level of this repository](https://github.com/dedis/tlc/).
7 | For more details on this package see the code and its
8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/model).
9 |
--------------------------------------------------------------------------------
/go/model/doc.go:
--------------------------------------------------------------------------------
1 | // Package model implements a simple pedagogic model of TLC and QSC.
2 | // It uses no cryptography and supports only failstop, non-Byzantine consensus,
3 | // but should be usable in scenarios that would typically employ Paxos or Raft.
4 | //
5 | // This implementation is less than 200 lines of actual code as counted by CLOC,
6 | // so a good way to understand it is to read the code directly at
7 | // https://github.com/dedis/tlc/tree/master/go/model.
8 | // You can test this implementation in a variety of consensus configurations
9 | // using only goroutines and channels for communication via:
10 | //
11 | // go test -v
12 | //
13 | // To read about the principles underlying TLC and QSC, please refer to
14 | // https://arxiv.org/abs/1907.07010.
15 | // For a high-level overview of the different implementations of TLC/QSC
16 | // in different languages that live in this repository, please see
17 | // https://github.com/dedis/tlc/.
18 | //
19 | // Configuring and launching consensus groups
20 | //
21 | // To use this implementation of QSC,
22 | // a user of this package must first configure and launch
23 | // a threshold group of nodes.
24 | // This package handles only the core consensus logic,
25 | // leaving matters such as node configuration, network names, connections,
26 | // and wire-format marshaling and unmarshaling to the client of this package.
27 | //
28 | // The client using this package
29 | // must assign each node a unique number from 0 through nnode-1,
30 | // e.g., by configuring the group with a well-known ordering of its members.
31 | // Only node numbers are important to this package; it is oblivious to names.
32 | //
33 | // When each node in the consensus group starts,
34 | // the client calls NewNode to initialize the node's TLC and QSC state.
35 | // The client may then change optional Node configuration parameters,
36 | // such as Node.Rand, before actually commencing protocol message processing.
37 | // The client then calls Node.Advance to launch TLC and the consensus protocol,
38 | // advance to time-step zero, and broadcast a proposal for this time-step.
39 | // Thereafter, the protocol self-clocks asynchronously using TLC
40 | // based on network communication.
41 | //
42 | // Consensus protocol operation and results
43 | //
44 | // This package implements QSC in pipelined fashion, which means that
45 | // a sliding window of three concurrent QSC rounds is active at any time.
46 | // At the start of any given time step s when Advance broadcasts a Raw message,
47 | // this event initiates a new consensus round starting at s and ending at s+3,
48 | // and (in the steady state) completes a consensus round that started at s-3.
49 | // Each Message a node broadcasts includes QSC state from four rounds:
50 | // Message.QSC[0] holds the results of the consensus round just completed,
51 | // while QSC[1] through QSC[3] hold the state of the three still-active rounds,
52 | // with QSC[3] being the newest round just launched.
53 | //
54 | // If Message.QSC[0].Commit is true in the Raw message commencing a time-step,
55 | // then this node saw the round ending at step Message.Step as fully committed.
56 | // In this case, all nodes will have agreed on the same proposal in that round,
57 | // which is the proposal made by node number Message.QSC[0].Conf.From.
58 | // If the client was waiting for a particular transaction to be ordered
59 | // or definitely committed/aborted according to the client's transaction rules,
60 | // then seeing that Message.QSC[0].Commit is true means that the client may
61 | // resolve the status of transactions proposed up to Message.Step-3.
62 | // Other nodes might not have observed this same round as committed, however,
63 | // so the client must not assume that other nodes are also necessarily aware
64 | // that this consensus round successfully committed.
65 | //
66 | // If Message.QSC[0].Commit is false, the round may or may not have converged:
67 | // this node simply cannot determine conclusively whether the round converged.
68 | // Other nodes might have chosen different "best confirmed" proposals,
69 | // as indicated in their respective QSC[0].Conf.From broadcasts for this step.
70 | // Alternatively, the round may in fact have converged,
71 | // and other nodes might observe that fact, even though this node did not.
72 | //
73 | // Message transmission, marshaling
74 | //
75 | // This package invokes the send function provided to NewNode to send messages,
76 | // leaving any wire-format marshaling required to the provided function.
77 | // This allows the client complete control over the desired wire format,
78 | // and to include other information beyond the fields defined in Message,
79 | // such as any semantic content on which the client wishes to achieve consensus.
80 | // On receipt of a message from another node,
81 | // the client must unmarshal it as appropriate
82 | // and invoke Node.Receive with the unmarshaled Message.
83 | //
84 | // Concurrency control
85 | //
86 | // The consensus protocol logic in this package is not thread safe:
87 | // it must be run in a single goroutine,
88 | // or else the client must implement appropriate locking.
89 | //
90 | package model
91 |
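92 | // exampleLaunch is a wiring sketch, not a prescribed use of this package
93 | // (the function name and channel capacities are illustrative assumptions).
94 | // It shows the minimal glue a client provides: a send function,
95 | // per-node delivery channels standing in for the network,
96 | // and one goroutine per node feeding incoming messages to Receive.
97 | // A real client would also marshal messages and wait for the goroutines.
98 | func exampleLaunch(thres, nnode, maxSteps int) []*Node {
99 | 	peer := make([]chan *Message, nnode)
100 | 	send := func(dst int, msg *Message) { peer[dst] <- msg }
101 | 
102 | 	nodes := make([]*Node, nnode)
103 | 	for i := range nodes {
104 | 		peer[i] = make(chan *Message, 3*nnode*maxSteps) // ample buffering
105 | 		nodes[i] = NewNode(i, thres, nnode, send)
106 | 	}
107 | 	for i, n := range nodes {
108 | 		go func(i int, n *Node) {
109 | 			n.Advance() // broadcast the proposal for time-step 0
110 | 			for n.m.Step < maxSteps {
111 | 				n.Receive(<-peer[i]) // process one incoming message
112 | 			}
113 | 		}(i, n)
114 | 	}
115 | 	return nodes
116 | }
117 | 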
--------------------------------------------------------------------------------
/go/model/model_test.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import (
4 | "fmt"
5 | "math/rand"
6 | "sync"
7 | "testing"
8 | )
9 |
10 | func (n *Node) run(maxSteps int, peer []chan *Message, wg *sync.WaitGroup) {
11 |
12 | // broadcast message for initial time step s=0
13 | n.Advance() // broadcast message for initial time step
14 |
15 | // run the required number of time steps for the test
16 | for n.m.Step < maxSteps {
17 | msg := <-peer[n.m.From] // Receive a message
18 | n.Receive(msg) // Process it
19 | }
20 |
21 | // signal that we're done
22 | wg.Done()
23 | }
24 |
25 | // Run a consensus test case with the specified parameters.
26 | func testRun(t *testing.T, thres, nnode, maxSteps, maxTicket int) {
27 | if maxTicket == 0 { // Default to moderate-entropy tickets
28 | maxTicket = 10 * nnode
29 | }
30 | desc := fmt.Sprintf("T=%v,N=%v,Steps=%v,Tickets=%v",
31 | thres, nnode, maxSteps, maxTicket)
32 | t.Run(desc, func(t *testing.T) {
33 | all := make([]*Node, nnode)
34 | peer := make([]chan *Message, nnode)
35 | send := func(dst int, msg *Message) { peer[dst] <- msg }
36 |
37 | for i := range all { // Initialize all the nodes
38 | peer[i] = make(chan *Message, 3*nnode*maxSteps)
39 | all[i] = NewNode(i, thres, nnode, send)
40 | if maxTicket > 0 {
41 | all[i].Rand = func() int64 {
42 | return rand.Int63n(int64(maxTicket))
43 | }
44 | }
45 | }
46 | wg := &sync.WaitGroup{}
47 | for _, n := range all { // Run the nodes on separate goroutines
48 | wg.Add(1)
49 | go n.run(maxSteps, peer, wg)
50 | }
51 | wg.Wait()
52 | testResults(t, all) // Report test results
53 | })
54 | }
55 |
56 | // Dump the consensus state of node n in round s
57 | func (n *Node) testDump(t *testing.T, s, nnode int) {
58 | r := &n.m.QSC[s]
59 | t.Errorf("%v %v conf %v %v re %v %v spoil %v %v",
60 | n.m.From, s, r.Conf.From, r.Conf.Tkt,
61 | r.Reconf.From, r.Reconf.Tkt, r.Spoil.From, r.Spoil.Tkt)
62 | }
63 |
64 | // Globally sanity-check and summarize each node's observed results.
65 | func testResults(t *testing.T, all []*Node) {
66 | for i, ni := range all {
67 | commits := 0
68 | for s, si := range ni.m.QSC {
69 | if si.Commit {
70 | commits++
71 | for _, nj := range all { // verify consensus
72 | if nj.m.QSC[s].Conf.From !=
73 | si.Conf.From {
74 |
75 | t.Errorf("%v %v UNSAFE", i, s)
76 | ni.testDump(t, s, len(all))
77 | nj.testDump(t, s, len(all))
78 | }
79 | }
80 | }
81 | }
82 | t.Logf("node %v committed %v of %v (%v%% success rate)",
83 | i, commits, len(ni.m.QSC), (commits*100)/len(ni.m.QSC))
84 | }
85 | }
86 |
87 | // Run QSC consensus for a variety of test cases.
88 | func TestQSC(t *testing.T) {
89 | testRun(t, 1, 1, 100000, 0) // Trivial case: 1 of 1 consensus!
90 | testRun(t, 2, 2, 100000, 0) // Another trivial case: 2 of 2
91 |
92 | testRun(t, 2, 3, 100000, 0) // Standard f=1 case
93 | testRun(t, 3, 5, 100000, 0) // Standard f=2 case
94 | testRun(t, 4, 7, 10000, 0) // Standard f=3 case
95 | testRun(t, 5, 9, 10000, 0) // Standard f=4 case
96 | testRun(t, 11, 21, 10000, 0) // Standard f=10 case
97 |
98 | testRun(t, 3, 3, 100000, 0) // Larger-than-minimum thresholds
99 | testRun(t, 6, 7, 10000, 0)
100 | testRun(t, 9, 10, 10000, 0)
101 |
102 | // Test with low-entropy tickets: hurts commit rate, but still safe!
103 | testRun(t, 2, 3, 100000, 1) // Limit case: will never commit
104 | testRun(t, 2, 3, 100000, 2) // Extreme low-entropy: rarely commits
105 | 	testRun(t, 2, 3, 100000, 3) // A bit better but still bad...
106 | }
107 |
--------------------------------------------------------------------------------
/go/model/node.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import "math/rand"
4 |
5 | // Type represents the type of a QSC message: either Raw, Ack, or Wit.
6 | //
7 | // At the start of each time step a node broadcasts a Raw message,
8 | // which proposes a block for the consensus round starting at this step
9 | // and solicits witness acknowledgments to the proposal.
10 | //
11 | // Nodes that receive a Raw message during the same time step
12 | // reply with a unicast Ack message to Raw message's sender,
13 | // acknowledging that they have seen the sender's proposal
14 | // and merged in its QSC state.
15 | //
16 | // Once a node has received a threshold of Ack messages to its Raw proposal,
17 | // the node broadcasts a Wit message to announce that its proposal is witnessed.
18 | // Nodes wait to collect a threshold of Wit messages as their condition
19 | // to advance to the next time step and broadcast their next Raw message.
20 | //
21 | type Type int
22 |
23 | const (
24 | // Raw unwitnessed QSC proposal
25 | Raw Type = iota
26 | // Ack is the acknowledgment of a proposal
27 | Ack
28 | // Wit is the threshold witness confirmation of a proposal
29 | Wit
30 | )
31 |
32 | // Message contains the information nodes must pass in messages
33 | // both to run the TLC clocking protocol and achieve QSC consensus.
34 | //
35 | // This implementation of QSC performs no message marshaling or unmarshalling;
36 | // the client using it must handle message wire-format serialization.
37 | // However, the Message struct is defined so as to be compatible with
38 | // standard Go encoders such as encoding/gob or encoding/json.
39 | // The client may also marshal/unmarshal its own larger message struct
40 | // containing a superset of the information here,
41 | // such as to attach semantic content in some form to consensus proposals.
42 | type Message struct {
43 | From int // Node number of node that sent this message
44 | Step int // Logical time step this message is for
45 | 	Type Type   // Message type: Raw, Ack, or Wit
46 | Tkt uint64 // Genetic fitness ticket for consensus
47 | QSC []Round // QSC consensus state for rounds ending at Step or later
48 | }
49 |
50 | // Node contains per-node state and configuration for TLC and QSC.
51 | // Use NewNode to create and properly initialize an instance
52 | // with the mandatory configuration parameters.
53 | // Public fields in this struct are optional configuration settings,
54 | // which NewNode initializes to defaults but the caller may change
55 | // after calling NewNode but before commencing protocol execution.
56 | //
57 | // Consensus uses the configurable Rand function to choose "genetic fitness"
58 | // lottery tickets for each node's proposal in each round.
59 | // Only the low 63 bits of the returned int64 are used.
60 | // This defaults to using the system's math/rand.Int63().
61 | // To tolerate sophisticated network denial-of-service attackers,
62 | // a full implementation should use cryptographic randomness
63 | // and hide the tickets from the network using encryption (e.g., TLS).
64 | //
65 | // The Rand function must not be changed once the Node is in operation.
66 | // All nodes must use the same nonnegative random number distribution.
67 | // Ticket collisions are not a problem as long as they are rare,
68 | // which is why 63 bits of entropy is sufficient.
69 | //
70 | type Node struct {
71 | m Message // Template for messages we send
72 |
73 | thres int // TLC message and witness thresholds
74 | nnode int // Total number of nodes
75 | send func(peer int, msg *Message) // Function to send message to a peer
76 |
77 | acks int // # acknowledgments we've received in this step
78 | wits int // # threshold witnessed messages seen this step
79 |
80 | Rand func() int64 // Function to generate random genetic fitness tickets
81 | }
82 |
83 | // NewNode creates and initializes a new Node with the specified group configuration.
84 | // The parameters to NewNode are the mandatory Node configuration parameters:
85 | // self is this node's number, thres is the TLC message and witness threshold,
86 | // nnode is the total number of nodes,
87 | // and send is a function to send a Message to a given peer node number.
88 | //
89 | // Optional configuration is represented by fields in the created Node struct,
90 | // which the caller may modify before commencing the consensus protocol.
91 | //
92 | func NewNode(self, thres, nnode int, send func(peer int, msg *Message)) (n *Node) {
93 | return &Node{
94 | m: Message{From: self, Step: -1,
95 | QSC: make([]Round, 3)}, // "rounds" ending in steps 0-2
96 | thres: thres, nnode: nnode, send: send,
97 | Rand: rand.Int63}
98 | }
99 |
--------------------------------------------------------------------------------
/go/model/qsc.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | // Best is a record representing either a best confirmed proposal,
4 | // or a best potential spoiler competing with the best confirmed proposal,
5 | // used in the Round struct.
6 | //
7 | // In each case, the only information we really need is
8 | // the genetic fitness lottery ticket of the "best" proposal seen so far,
9 | // and which node produced that proposal.
10 | // This optimization works only in the non-Byzantine QSC consensus protocol,
11 | // because Byzantine consensus requires that the lottery tickets be
12 | // unknown and unbiasable to everyone until the consensus round completes.
13 | //
14 | // When we're collecting the best potential spoiler proposal -
15 | // the proposal with the highest ticket regardless of whether it's confirmed -
16 | // we must keep track of ticket collisions,
17 | // in case one colliding proposal might "win" if not spoiled by the other.
18 | // When we detect a spoiler collision, we simply set From to -1,
19 | // an invalid node number that will be unequal to, and hence properly "spoil",
20 | // a confirmed or reconfirmed proposal with the same ticket from any node.
21 | //
22 | type Best struct {
23 | From int // Node the proposal is from (spoiler: -1 for tied tickets)
24 | Tkt uint64 // Proposal's genetic fitness ticket
25 | }
26 |
27 | // Find the Best of two records primarily according to highest ticket number.
28 | // For spoilers, detect and record ticket collisions with invalid node number.
29 | func (b *Best) merge(o *Best, spoiler bool) {
30 | if o.Tkt > b.Tkt {
31 | *b = *o // strictly better ticket
32 | } else if o.Tkt == b.Tkt && o.From != b.From && spoiler {
33 | b.From = -1 // record ticket collision
34 | }
35 | }
36 |
37 | // Round encapsulates all the QSC state needed for one consensus round:
38 | // the best potential "spoiler" proposal regardless of confirmation status,
39 | // the best confirmed (witnessed) proposal we've seen so far in the round,
40 | // and the best reconfirmed (double-witnessed) proposal we've seen so far.
41 | // Finally, at the end of the round, we set Commit to true if
42 | // the best confirmed proposal in Conf has definitely been committed.
43 | type Round struct {
44 | Spoil Best // Best potential spoiler(s) we've found so far
45 | Conf Best // Best confirmed proposal we've found so far
46 | Reconf Best // Best reconfirmed proposal we've found so far
47 | Commit bool // Whether we confirm this round successfully committed
48 | }
49 |
50 | // Merge QSC round info from an incoming message into our round history
51 | func mergeQSC(b, o []Round) {
52 | for i := range b {
53 | b[i].Spoil.merge(&o[i].Spoil, true)
54 | b[i].Conf.merge(&o[i].Conf, false)
55 | b[i].Reconf.merge(&o[i].Reconf, false)
56 | }
57 | }
58 |
59 | // The TLC layer upcalls this method on advancing to a new time-step,
60 | // with sets of proposals recently seen (saw) and threshold witnessed (wit).
61 | func (n *Node) advanceQSC() {
62 |
63 | // Choose a fresh genetic fitness ticket for this proposal
64 | n.m.Tkt = uint64(n.Rand()) | (1 << 63) // Ensure it's greater than zero
65 |
66 | // Initialize consensus state for the round starting at step.
67 | // Find best spoiler, breaking ticket ties in favor of higher node
68 | newRound := Round{Spoil: Best{From: n.m.From, Tkt: n.m.Tkt}}
69 | n.m.QSC = append(n.m.QSC, newRound)
70 |
71 | // Decide if the just-completed consensus round successfully committed.
72 | r := &n.m.QSC[n.m.Step]
73 | r.Commit = r.Conf.From == r.Reconf.From && r.Conf.From == r.Spoil.From
74 | }
75 |
76 | // TLC layer upcalls this to inform us that our proposal is threshold witnessed
77 | func (n *Node) witnessedQSC() {
78 |
79 | // Our proposal is now confirmed in the consensus round just starting
80 | // Find best confirmed proposal, breaking ties in favor of lower node
81 | myBest := &Best{From: n.m.From, Tkt: n.m.Tkt}
82 | n.m.QSC[n.m.Step+3].Conf.merge(myBest, false)
83 |
84 | // Find reconfirmed proposals for the consensus round that's in step 1
85 | n.m.QSC[n.m.Step+2].Reconf.merge(&n.m.QSC[n.m.Step+2].Conf, false)
86 | }
87 |
--------------------------------------------------------------------------------
/go/model/qscod/README.md:
--------------------------------------------------------------------------------
1 | This Go package provides a minimal implementation of
2 | Que Sera Consensus (QSC) built on Threshold Logical Clocks (TLC)
3 | for fail-stop, non-Byzantine environments.
4 | For background information on QSC and TLC,
5 | and other model implementations in several languages, please see the
6 | [top level of this repository](https://github.com/dedis/tlc/).
7 | For more details on this package see the code and its
8 | [GoDoc documentation](https://godoc.org/github.com/dedis/tlc/go/model/qscod).
9 |
--------------------------------------------------------------------------------
/go/model/qscod/core/cli.go:
--------------------------------------------------------------------------------
1 | // Package core implements the minimal core of the QSCOD consensus algorithm
2 | // for client-driven "on-demand" consensus.
3 | //
4 | // This implementation of QSCOD builds on the TLCB and TLCR
5 | // threshold logical clock algorithms.
6 | // These algorithms are extremely simple but do impose one constraint:
7 | // the number of failing nodes must be at most one-third the group size.
8 | //
9 | // The unit tests for this package are in the test sub-package,
10 | // so that useful test framework code can be shared with other packages
11 | // without requiring any of it to be imported into development builds.
12 | // (Too bad Go doesn't allow packages to export and import test code.)
13 | //
14 | package core
15 |
16 | //import "fmt"
17 | import "sync"
18 | import "context"
19 |
20 | // Store represents an interface to one of the n key/value stores
21 | // representing the persistent state of each of the n consensus group members.
22 | // A Store's keys are integer TLC time-steps,
23 | // and its values are Value structures.
24 | //
25 | // WriteRead(v) attempts to write value v to the store at step v.S,
26 | // returning the first value successfully written at that step by any client.
27 | // WriteRead may also return a value from any higher time-step,
28 | // if other clients have moved the store's state beyond v.S.
29 | //
30 | // This interface intentionally provides no means to return an error.
31 | // If WriteRead encounters an error that might be temporary or recoverable,
32 | // then it should just keep trying (perhaps with appropriate backoff).
33 | // This is the fundamental idea of asynchronous fault tolerant consensus:
34 | // to tolerate individual storage node faults, persistently without giving up,
35 | // waiting for as long as it takes for the store to become available again.
36 | //
37 | // If the application encounters an error that warrants a true global failure,
38 | // then it should cancel the context passed to Client.Run,
39 | // which will eventually cause all the worker threads to terminate.
40 | // In this case, the application can cancel any active WriteRead calls,
41 | // which may simply return the value v that was requested to be written
42 | // in order to allow the per-node worker thread to terminate cleanly.
43 | //
44 | type Store interface {
45 | WriteRead(v Value) Value
46 | }
47 |
48 | // Value represents the values that a consensus node's key/value Store maps to.
49 | type Value struct {
50 | S int64 // TLC step number this broadcast value is for
51 | P string // Application data string for this proposal
52 | I int64 // Random integer priority for this proposal
53 | R, B Set // Read set and broadcast set from TLCB
54 | }
55 |
56 | // Set represents a set of proposed values from the same time-step,
57 | // indexed by integer node numbers.
58 | type Set map[int]Value
59 |
60 | // best returns some maximum-priority Value in a Set,
61 | // together with a flag indicating whether the returned value is uniquely best,
62 | // i.e., no proposal with different application data is tied for best priority.
63 | func (S Set) best() (bn int, bv Value, bu bool) {
64 | for n, v := range S {
65 | if v.I >= bv.I {
66 | // A new best value is unique (so far)
67 | // if its priority is strictly higher than the last,
68 | // or if it has equal priority, was unique so far,
69 | // and is proposing identical application data.
70 | bn, bv, bu = n, v, v.I > bv.I || (bu && v.P == bv.P)
71 | }
72 | }
73 | return bn, bv, bu
74 | }
75 |
76 | // Client represents a logical client that can propose transactions
77 | // to the consensus group and drive the QSC/TLC state machine forward
78 | // asynchronously across the key/value stores defining the group's state.
79 | //
80 | // The caller must initialize the public variables
81 | // to represent a valid QSCOD configuration,
82 | // before invoking Client.Run to run the consensus algorithm.
83 | // The public configuration variables must not be changed
84 | // after starting the client.
85 | //
86 | // KV is a slice containing interfaces to each of the key/value stores
87 | // that hold the persistent state of each node in the consensus group.
88 | // The total number of nodes N is defined to be len(KV).
89 | //
90 | // Tr and Ts are the receive and spread thresholds, respectively.
91 | // To ensure liveness against up to F slow or crashed nodes,
92 | // the receive threshold must exclude the F failed nodes: i.e., Tr <= N-F.
93 | // To ensure consistency (safety), the constraint Tr+Ts > N must hold.
94 | // Finally, to ensure that each round enjoys a high probability
95 | // of successful commitment, it should be the case that N >= 3F.
96 | // Thus, given F and N >= 3F, it is safe to set Tr = N-F and Ts = N-Tr+1.
97 | // The precise minimum threshold requirements are slightly more subtle,
98 | // but this is a safe and simpler configuration rule.
99 | //
100 | // Pr is a callback function that the Client calls regularly while running,
101 | // to update the caller's knowledge of committed proposals
102 | // and to obtain the proposal data the client should next try to commit.
103 | // Client passes to Pr the step number and proposal Data string
104 | // of the most recent state it knows about, together with a flag
105 | // indicating whether that proposal is known to have been committed.
106 | // This known state will advance regularly across calls,
107 | // but may not change on each call and may not even be monotonic.
108 | // Pr returns a string representing the new preferred proposal Data
109 | // that the Client will subsequently attempt to commit,
110 | // together with a random priority value for that proposal.
111 | //
112 | // The returned priorities are the non-negative random numbers that QSCOD
113 | // requires for symmetry breaking.
114 | // In a production system, these random numbers should have high entropy
115 | // for maximum performance (minimum likelihood of collisions),
116 | // and should be generated from a cryptographically strong private source
117 | // for maximum protection against denial-of-service attacks in the network.
118 | //
119 | type Client struct {
120 | KV []Store // Per-node key/value state storage interfaces
121 | Tr, Ts int // Receive and spread threshold configuration
122 |
123 | Pr func(int64, string, bool) (string, int64) // Proposal function
124 |
125 | mut sync.Mutex // Mutex protecting this client's state
126 | }
127 |
128 | type work struct {
129 | cond *sync.Cond // For awaiting threshold conditions
130 | val Value // Value template each worker will try to write
131 | kvc Set // Key/value cache collected for this time-step
132 | max Value // Value with highest time-step we must catch up to
133 | next *work // Forward pointer to next work item
134 | }
135 |
136 | // Run starts a client running with its given configuration parameters,
137 | // proposing transactions and driving the consensus state machine continuously
138 | // forever or until the passed context is cancelled.
139 | //
140 | func (c *Client) Run(ctx context.Context) (err error) {
141 |
142 | // Keep the mutex locked whenever we're not waiting.
143 | c.mut.Lock()
144 | defer c.mut.Unlock()
145 |
146 | // Launch one client thread to drive each of the n consensus nodes.
147 | w := &work{kvc: make(Set), cond: sync.NewCond(&c.mut)}
148 | for i := range c.KV {
149 | go c.worker(i, w)
150 | }
151 |
152 | // Drive consensus state forever or until our context gets cancelled.
153 | for ; ctx.Err() == nil; w = w.next {
154 |
155 | // Wait for a threshold number of worker threads
156 | // to complete the current work-item
157 | for len(w.kvc) < c.Tr {
158 | w.cond.Wait()
159 | }
160 |
161 | //str := fmt.Sprintf("at %v kvc contains:", w.val.S)
162 | //for i, v := range w.kvc {
163 | // str += fmt.Sprintf(
164 | // "\n node %v step %v data %q pri %v R %v B %v",
165 | // i, v.S, v.P, v.I, len(v.R), len(v.B))
166 | //}
167 | //println(str)
168 |
169 | // Set the next work-item pointer in the current work-item,
170 | // so that the worker threads know there will be a next item.
171 | w.next = &work{kvc: make(Set), cond: sync.NewCond(&c.mut)}
172 |
173 | // Wake up worker threads waiting for a next item to appear
174 | w.cond.Broadcast()
175 |
176 | // Decide on the next step number and value to broadcast,
177 | // based on the threshold set we collected,
178 | // which is now immutable and consistent across threads.
179 | // v := Value{P:Head{Step:w.max.S+1}}
180 | nv := &w.next.val
181 | // v.S = w.max.S+1
182 | nv.S = w.val.S + 1
183 | switch {
184 |
185 | case w.max.S > w.val.S:
186 |
187 | // Some node already reached a higher time-step.
188 | // Our next work item is simply to catch up all nodes
189 | // at least to the highest-known step we discovered.
190 | //println("catching up from", w.val.S, "to", w.max.S)
191 | *nv = w.max
192 |
193 | case (w.val.S & 1) == 0: // finishing even-numbered step
194 |
195 | // Complete the first TLCR broadcast
196 | // and start the second within a TLCB round.
197 | // The value for the second broadcast is simply
198 | // the threshold receive set from the first.
199 | nv.R = w.kvc
200 |
201 | case (w.val.S & 3) == 1:
202 |
203 | // Complete the first TLCB call in a QSCOD round
204 | // and start the second TLCB call for the round.
205 |
206 | // Calculate valid potential (still tentative)
207 | // R and B sets from the first TLCB call in this round,
208 | // and include them in the second TLCB broadcast.
209 | R0, B0 := c.tlcbRB(w.kvc)
210 |
211 | // Pick any best confirmed proposal from B0
212 | // as our broadcast for the second TLCB round.
213 | _, v2, _ := B0.best()
214 |
215 | // Set the value for the second TLCB call to broadcast
216 | nv.I, nv.R, nv.B = v2.I, R0, B0
217 |
218 | case (w.val.S & 3) == 3:
219 |
220 | // Complete a prior QSCOD round and start a new one.
221 |
222 | // First, calculate valid potential R2 and B2 sets from
223 | // the second TLCB call in the completed QSCOD round.
224 | R2, B2 := c.tlcbRB(w.kvc)
225 |
226 | // We always adopt some best confirmed proposal from R2
227 | // as our own (still tentative so far) view of history.
228 | // If this round successfully commits,
229 | // then our b2 will be the same as everyone else's,
230 | // even if we fail below to realize that fact.
231 | _, b2, _ := R2.best()
232 |
233 | // Find the best-known proposal b0 in some node's R0.
234 | // We can get an R0 set from the first round in b2.R.
235 | // Also determine if b0 was uniquely best in this R0.
236 | // Our R2 and B2 sets will be subsets of any valid R0.
237 | n0, b0, u0 := b2.R.best()
238 |
239 | // See if we can determine b2 to have been committed:
240 | // if b0==b2 is the uniquely-best eligible proposal.
241 | // This test may succeed only for some nodes in a round.
242 | // If b0 is uniquely-best in R0 we can compare priorities
243 | // to see if two values are the same node's proposal.
244 | // // Never commit proposals that don't change the Data,
245 | // // since we use those to represent "no-op" proposals.
246 | com := u0 && b0.I == b2.I && b0.I == B2[n0].I
247 | if com {
248 | // if u0 && b0.I == b2.I && b0.I == B2[n0].I &&
249 | // b0.P.Data != v.C.Data
250 |
251 | // b0.P is the original proposal with data,
252 | // which becomes the new current commit C.
253 | // The previous current commit
254 | // becomes the last commit L.
255 | //println("committed", b0.S, "data", b0.P)
256 | // v.L, v.C = v.C, b0.P
257 | }
258 |
259 | // Set the value for the first TLCB call
260 | // in the next QSCOD round to broadcast,
261 | // containing a proposal for the next round.
262 | nv.P, nv.I = c.Pr(b0.S, b0.P, com)
263 | }
264 |
265 | //fmt.Printf("at %v next step %v pri %v prop %q R %v B %v\n",
266 | // w.val.S, nv.S, nv.I, nv.P, len(nv.R), len(nv.B))
267 |
268 | //if nv.S < w.max.S {
269 | // println("no progress: s", w.val.S, "lv", w.max.S,
270 | // "to", nv.S)
271 | //}
272 | }
273 |
274 | // Signal the worker threads to terminate with an all-nil work-item
275 | w.next = &work{}
276 | w.cond.Broadcast()
277 |
278 | // Any slow client threads will continue in the background
279 | // until they catch up with the others or successfully get cancelled.
280 | return ctx.Err()
281 | }
282 |
283 | // worker handles a goroutine dedicated to submitting WriteRead requests
284 | // to each consensus group node asynchronously without delaying the main thread.
285 | //
286 | // We could in principle launch a separate goroutine per node each time step,
287 | // which would be even simpler to manage and provide higher parallelism.
288 | // But this would risk creating a ton of outstanding concurrent goroutines
289 | // trying to access the same slow node(s) and overloading those nodes further,
290 | // or creating local resource pressures such as too many open file descriptors
291 | // in case each WriteRead call opens a new file descriptor or socket, etc.
292 | // So we have only one worker per consensus group node do everything serially,
293 | // limiting resource usage while protecting the main thread from slow nodes.
294 | //
295 | func (c *Client) worker(node int, w *work) {
296 |
297 | // Keep Client state locked while we're not waiting
298 | c.mut.Lock()
299 |
300 | // Process work-items defined by the main thread in sequence,
301 | // terminating when we encounter a work-item with a nil kvc.
302 | for ; w.kvc != nil; w = w.next {
303 |
304 | // // Pull the next Value template we're supposed to write
305 | // v := w.val
306 |
307 | // // In steps that start a new QSC round with new proposals,
308 | // // each node gets its own independent random priority
309 | // // even when they're proposals of the same application value.
310 | // if (v.S & 3) == 0 {
311 | // v.I = c.RV()
312 | // }
313 |
314 | //println(w, "before WriteRead step", w.val.S)
315 |
316 | // Try to write new value, then read whatever the winner wrote.
317 | c.mut.Unlock()
318 | v := c.KV[node].WriteRead(w.val)
319 | c.mut.Lock()
320 |
321 | //println(w, "after WriteRead step", w.val.S, "read", v.S)
322 |
323 | //if v.S < w.val.S {
324 | // println("read back value from old step", v.S, w.val.S)
325 | //}
326 |
327 | // Collect a threshold number of last-step values in w.kvc,
328 | // after which work-item w will be considered complete.
329 | // Don't modify kvc or max after reaching the threshold tr,
330 | // because they are expected to be immutable afterwards.
331 | if len(w.kvc) < c.Tr {
332 |
333 | // Record the actual value read in the work-item
334 | w.kvc[node] = v
335 |
336 | // Track the highest last-step value read on any node,
337 | // which may be higher than the one we tried to write
338 | // if we need to catch up with a faster node.
339 | if v.S > w.max.S {
340 | w.max = v
341 | }
342 |
343 | // Wake up the main thread when we reach the threshold
344 | if len(w.kvc) == c.Tr {
345 | w.cond.Broadcast()
346 | }
347 | }
348 |
349 | // Wait until the main thread has created a next work-item.
350 | for w.next == nil {
351 | w.cond.Wait()
352 | }
353 | }
354 |
355 | c.mut.Unlock()
356 | }
357 |
358 | // tlcbRB calculates the receive (R) and broadcast (B) sets
359 | // returned by the TLCB algorithm after its second TLCR call.
360 | //
361 | // The returned R and B sets are only tentative,
362 | // representing possible threshold receive-set and broadcast-set outcomes
363 | // from this TLCB invocation, computed locally by this client.
364 | // These locally-computed sets cannot be relied on to be definite for this node
365 | // until the values computed from them are committed via Store.WriteRead.
366 | //
367 | func (c *Client) tlcbRB(kvc Set) (Set, Set) {
368 |
369 | // Using the tentative client-side receive-set from the second TLCR,
370 | // compute potential receive-set (R) and broadcast-set (B) sets
371 | // to return from TLCB.
372 | R, B, Bc := make(Set), make(Set), make([]int, len(c.KV))
373 | for _, v := range kvc {
374 | for j, vv := range v.R {
375 | R[j] = vv // R has all values we've seen
376 | Bc[j]++ // How many nodes have seen vv?
377 | if Bc[j] >= c.Ts { // B has only those reaching ts
378 | B[j] = vv
379 | }
380 | }
381 | }
382 | return R, B
383 | }
384 |
--------------------------------------------------------------------------------
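As a rough usage sketch (not part of the package; the example package, wrapper function, and proposal strings below are hypothetical), a caller might configure the Client above as follows, with thresholds computed by the rule Tr = N-F and Ts = N-Tr+1 from the comment. For F = 1 and N = 3F = 3 this gives Tr = 2 and Ts = 2, so Tr+Ts = 4 > N.

    package example // hypothetical wrapper package, for illustration only

    import (
        "context"
        "fmt"
        "math/rand"

        "github.com/dedis/tlc/go/model/qscod/core"
    )

    // runConsensus drives one QSCOD client over an existing set of stores,
    // such as the FileStore implementations elsewhere in this repository.
    func runConsensus(ctx context.Context, kv []core.Store, f int) error {
        n := len(kv) // group size N, expected to be at least 3F
        c := core.Client{
            KV: kv,
            Tr: n - f, // receive threshold Tr = N-F
            Ts: f + 1, // spread threshold Ts = N-Tr+1, so Tr+Ts > N
            Pr: func(step int64, cur string, committed bool) (string, int64) {
                if committed {
                    fmt.Printf("committed at step %d: %q\n", step, cur)
                }
                // propose fresh data with a random symmetry-breaking priority
                return fmt.Sprintf("proposal after step %d", step), rand.Int63()
            },
        }
        return c.Run(ctx) // runs until ctx is cancelled
    }
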
/go/model/qscod/core/test/cli.go:
--------------------------------------------------------------------------------
1 | // Package test contains shareable code for testing instantiations of QSCOD.
2 | package test
3 |
4 | import (
5 | "context"
6 | "fmt"
7 | "math/rand"
8 | "sync"
9 | "testing"
10 |
11 | . "github.com/dedis/tlc/go/model/qscod/core"
12 | )
13 |
14 | // Object to record the common total order and verify it for consistency
15 | type testOrder struct {
16 | hist []string // all history known to be committed so far
17 | mut sync.Mutex // mutex protecting this reference order
18 | }
19 |
20 | // When a client reports a proposal committed at a given step,
21 | // record that in the testOrder and check it for global consistency.
22 | func (to *testOrder) committed(t *testing.T, step int64, prop string) {
23 | to.mut.Lock()
24 | defer to.mut.Unlock()
25 |
26 | // Ensure history slice is long enough
27 | for step >= int64(len(to.hist)) {
28 | to.hist = append(to.hist, "")
29 | }
30 |
31 | // Check commit consistency across all concurrent clients
32 | switch {
33 | case to.hist[step] == "":
34 | to.hist[step] = prop
35 | case to.hist[step] != prop:
36 | t.Errorf("Inconsistency at %v:\n old %q\n new %q",
37 | step, to.hist[step], prop)
38 | }
39 | }
40 |
41 | // testCli creates a test client with particular configuration parameters.
42 | func testCli(t *testing.T, self, f, maxstep, maxpri int,
43 | kv []Store, to *testOrder, wg *sync.WaitGroup) {
44 |
45 | // Create a cancelable context for the test run
46 | ctx, cancel := context.WithCancel(context.Background())
47 |
48 | // Our proposal function simply collects and consistency-checks
49 | // committed Heads until a designated time-step is reached.
50 | pr := func(step int64, cur string, com bool) (string, int64) {
51 | //fmt.Printf("cli %v saw commit %v %q\n", self, C.Step, C.Data)
52 |
53 | // Consistency-check the proposal known to be committed
54 | if com {
55 | to.committed(t, step, cur)
56 | }
57 |
58 | // Stop once we reach the step limit
59 | if step >= int64(maxstep) {
60 | cancel()
61 | }
62 |
63 | // Use the simple Int63n for random number generation,
64 | // with values constrained to be lower than maxpri for testing.
65 | // A real deployment should use cryptographic randomness
66 | // and should preferably be high-entropy,
67 | // close to the full 64 bits.
68 | pri := rand.Int63n(int64(maxpri))
69 |
70 | return fmt.Sprintf("cli %v proposal %v", self, step), pri
71 | }
72 |
73 | // Start the test client with appropriate parameters assuming
74 | // n=3f, tr=2f, tb=f, and ts=f+1, satisfying TLCB's constraints.
75 | c := Client{KV: kv, Tr: 2 * f, Ts: f + 1, Pr: pr}
76 | c.Run(ctx)
77 |
78 | wg.Done()
79 | }
80 |
81 | // Run a consensus test case on a given set of Store interfaces
82 | // and with the specified group configuration and test parameters.
83 | func TestRun(t *testing.T, kv []Store, nfail, ncli, maxstep, maxpri int) {
84 |
85 | // Create a reference total order for safety checking
86 | to := &testOrder{}
87 |
88 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Commits=%v,Tickets=%v",
89 | nfail, len(kv), ncli, maxstep, maxpri)
90 | t.Run(desc, func(t *testing.T) {
91 |
92 | // Simulate the appropriate number of concurrent clients
93 | wg := &sync.WaitGroup{}
94 | for i := 0; i < ncli; i++ {
95 | wg.Add(1)
96 | go testCli(t, i, nfail, maxstep, maxpri, kv, to, wg)
97 | }
98 | wg.Wait()
99 | })
100 | }
101 |
--------------------------------------------------------------------------------
/go/model/qscod/core/test/cli_test.go:
--------------------------------------------------------------------------------
1 | package test
2 |
3 | import (
4 | "sync"
5 | "testing"
6 |
7 | . "github.com/dedis/tlc/go/model/qscod/core"
8 | )
9 |
10 | // Trivial intra-process key-value store implementation for testing
11 | type testStore struct {
12 | mut sync.Mutex // synchronization for testStore state
13 | v Value // the latest value written
14 | }
15 |
16 | // WriteRead implements the Store interface with a simple intra-process map.
17 | func (ts *testStore) WriteRead(v Value) Value {
18 | ts.mut.Lock()
19 | defer ts.mut.Unlock()
20 |
21 | // Write value v only if it's newer than the last value written.
22 | if v.S > ts.v.S {
23 | ts.v = v
24 | }
25 |
26 | // Then return whatever was last written, regardless.
27 | return ts.v
28 | }
29 |
30 | // Run a consensus test case with the specified parameters.
31 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) {
32 |
33 | // Create a simple test key/value store representing each node
34 | kv := make([]Store, nnode)
35 | for i := range kv {
36 | kv[i] = &testStore{}
37 | }
38 |
39 | TestRun(t, kv, nfail, ncli, maxstep, maxpri)
40 | }
41 |
42 | // Test the Client with a trivial in-memory key/value Store implementation.
43 | func TestClient(t *testing.T) {
44 | testRun(t, 1, 3, 1, 100000, 100) // Standard f=1 case
45 | testRun(t, 1, 3, 2, 100000, 100)
46 | testRun(t, 1, 3, 10, 100000, 100)
47 | testRun(t, 1, 3, 20, 100000, 100)
48 | testRun(t, 1, 3, 50, 100000, 100)
49 | testRun(t, 1, 3, 100, 100000, 100)
50 |
51 | testRun(t, 2, 6, 10, 100000, 100) // Standard f=2 case
52 | testRun(t, 3, 9, 10, 100000, 100) // Standard f=3 case
53 | testRun(t, 4, 12, 10, 100000, 100) // Standard f=4 case
54 | testRun(t, 5, 15, 10, 100000, 100) // Standard f=5 case
55 |
56 | // Test with low-entropy tickets: hurts commit rate, but still safe!
57 | testRun(t, 1, 3, 10, 100000, 2) // Extreme low-entropy: rarely commits
58 | testRun(t, 1, 3, 10, 100000, 3) // A bit better but still bad...
59 | }
60 |
--------------------------------------------------------------------------------
/go/model/qscod/encoding/enc.go:
--------------------------------------------------------------------------------
1 | // Package encoding implements serialization of Values for QSCOD.
2 | // It currently just uses GOB encoding for simplicity,
3 | // but we should change that to something not Go-specific.
4 | package encoding
5 |
6 | import (
7 | "bytes"
8 | "encoding/gob"
9 |
10 | . "github.com/dedis/tlc/go/model/qscod/core"
11 | )
12 |
13 | // Encode a Value for serialized transmission.
14 | func EncodeValue(v Value) ([]byte, error) {
15 | buf := &bytes.Buffer{}
16 | enc := gob.NewEncoder(buf)
17 | if err := enc.Encode(v); err != nil {
18 | return nil, err
19 | }
20 | return buf.Bytes(), nil
21 | }
22 |
23 | // Decode a Value from its serialized format.
24 | func DecodeValue(b []byte) (v Value, err error) {
25 | r := bytes.NewReader(b)
26 | dec := gob.NewDecoder(r)
27 | err = dec.Decode(&v)
28 | return
29 | }
30 |
--------------------------------------------------------------------------------
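A brief round-trip sketch of the two functions above (the helper name and value are hypothetical; assumes imports of the core and encoding packages):

    // roundTrip shows EncodeValue and DecodeValue used together.
    func roundTrip() (core.Value, error) {
        v := core.Value{S: 4, P: "hello", I: 42}
        buf, err := encoding.EncodeValue(v)
        if err != nil {
            return core.Value{}, err
        }
        // DecodeValue returns a Value carrying the same S, P, and I fields as v
        return encoding.DecodeValue(buf)
    }
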
/go/model/qscod/fs/casdir/cas_test.go:
--------------------------------------------------------------------------------
1 | // Package casdir tests CAS-based QSCOD over a set of file system CAS stores.
2 | package casdir
3 |
4 | import (
5 | "context"
6 | "fmt"
7 | "os"
8 | "testing"
9 |
10 | . "github.com/dedis/tlc/go/lib/cas"
11 | "github.com/dedis/tlc/go/lib/cas/test"
12 | "github.com/dedis/tlc/go/lib/fs/casdir"
13 | . "github.com/dedis/tlc/go/model/qscod/cas"
14 | )
15 |
16 | // Run a consensus test case with the specified parameters.
17 | func testRun(t *testing.T, nfail, nnode, nclients, nthreads, naccesses int) {
18 |
19 | // Create a test key/value store representing each node
20 | dirs := make([]string, nnode)
21 | for i := range dirs {
22 | dirs[i] = fmt.Sprintf("test-store-%d", i)
23 |
24 | // Remove the test directory if one is left-over
25 | // from a previous test run.
26 | os.RemoveAll(dirs[i])
27 |
28 | // Create the test directory afresh.
29 | fs := &casdir.Store{}
30 | if err := fs.Init(dirs[i], true, true); err != nil {
31 | t.Fatal(err)
32 | }
33 |
34 | // Clean it up once the test is done.
35 | defer os.RemoveAll(dirs[i])
36 | }
37 |
38 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Threads=%v,Accesses=%v",
39 | nfail, nnode, nclients, nthreads, naccesses)
40 | t.Run(desc, func(t *testing.T) {
41 |
42 | // Create a context and cancel it at the end of the test
43 | ctx, cancel := context.WithCancel(context.Background())
44 | defer cancel()
45 |
46 | // Create simulated clients to access the consensus group
47 | clients := make([]Store, nclients)
48 | for i := range clients {
49 |
50 | // Create a set of Store objects for each client
51 | members := make([]Store, nnode)
52 | for j := range members {
53 | fs := &casdir.Store{}
54 | err := fs.Init(dirs[j], false, false)
55 | if err != nil {
56 | t.Fatal(err)
57 | }
58 | members[j] = fs
59 | }
60 |
61 | clients[i] = (&Group{}).Start(ctx, members, nfail)
62 | }
63 |
64 | // Run a standard torture test across all the clients
65 | test.Stores(t, nthreads, naccesses, clients...)
66 | })
67 | }
68 |
69 | func TestConsensus(t *testing.T) {
70 | testRun(t, 1, 3, 1, 10, 10) // Standard f=1 case,
71 | testRun(t, 1, 3, 2, 10, 10) // varying number of clients
72 | testRun(t, 1, 3, 10, 10, 10)
73 | testRun(t, 1, 3, 20, 10, 10)
74 |
75 | testRun(t, 2, 6, 10, 10, 10) // Standard f=2 case
76 | testRun(t, 3, 9, 10, 10, 10) // Standard f=3 case
77 |
78 | // Note: when nnode * nclients gets to be around 120-ish,
79 | // we start running into default max-open-file limits.
80 | }
81 |
--------------------------------------------------------------------------------
/go/model/qscod/fs/simple/store.go:
--------------------------------------------------------------------------------
1 | // Package simple provides a simple file system key/value Store for QSCOD,
2 | // with no support for garbage collection.
3 | // It is intended only for education, testing, and experimentation,
4 | // and not for any production use.
5 | //
6 | package simple
7 |
8 | import (
9 | "context"
10 | "fmt"
11 | "io/ioutil"
12 | "os"
13 | "path/filepath"
14 |
15 | "github.com/dedis/tlc/go/lib/backoff"
16 | "github.com/dedis/tlc/go/lib/fs/atomic"
17 | . "github.com/dedis/tlc/go/model/qscod/core"
18 | "github.com/dedis/tlc/go/model/qscod/encoding"
19 | )
20 |
21 | // FileStore implements a simple QSCOD key/value store
22 | // as a directory in a file system.
23 | // The caller must create the directory designated by Path.
24 | //
25 | type FileStore struct {
26 | Path string // Directory to contain files representing key/value state
27 | }
28 |
29 | // Attempt to write the value v to a file associated with time-step step,
30 | // then read back whichever value was successfully written first.
31 | //
32 | // This implementation simply panics if any file system error occurs.
33 | // A more robust approach suited to asynchronous consensus would be
34 | // to log the error then retry in an exponential-backoff loop.
35 | //
36 | func (fs *FileStore) WriteRead(v Value) (rv Value) {
37 |
38 | try := func() (err error) {
39 |
40 | // Serialize the proposed value
41 | buf, err := encoding.EncodeValue(v)
42 | if err != nil {
43 | return err
44 | }
45 |
46 | // Try to write the file, ignoring already-exists errors
47 | name := fmt.Sprintf("ver-%d", v.S)
48 | path := filepath.Join(fs.Path, name)
49 | err = atomic.WriteFileOnce(path, buf, 0666)
50 | if err != nil && !os.IsExist(err) {
51 | return err
52 | }
53 |
54 | // Read back whatever file was successfully written first there
55 | rbuf, err := ioutil.ReadFile(path)
56 | if err != nil {
57 | return err
58 | }
59 |
60 | // Deserialize the value read
61 | rv, err = encoding.DecodeValue(rbuf)
62 | if err != nil {
63 | return err
64 | }
65 |
66 | return nil
67 | }
68 |
69 | backoff.Retry(context.Background(), try)
70 | return rv
71 | }
72 |
73 | // QSCOD calls Committed to inform us that history comh is committed,
74 | // so we can garbage-collect entries before it in the key/value store.
75 | // But this Store does not implement garbage-collection.
76 | //
77 | func (fs *FileStore) Committed(comh Value) {
78 | // do nothing - no garbage collection
79 | }
80 |
--------------------------------------------------------------------------------
/go/model/qscod/fs/simple/store_test.go:
--------------------------------------------------------------------------------
1 | package simple
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "testing"
7 |
8 | . "github.com/dedis/tlc/go/model/qscod/core"
9 | . "github.com/dedis/tlc/go/model/qscod/core/test"
10 | )
11 |
12 | // Run a consensus test case with the specified parameters.
13 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) {
14 |
15 | // Create a test key/value store representing each node
16 | kv := make([]Store, nnode)
17 | for i := range kv {
18 | path := fmt.Sprintf("test-store-%d", i)
19 | ss := &FileStore{path}
20 | kv[i] = ss
21 |
22 | // Remove the test directory if one is left-over
23 | // from a previous test run.
24 | os.RemoveAll(path)
25 |
26 | // Create the test directory afresh.
27 | if err := os.Mkdir(path, 0744); err != nil {
28 | t.Fatal(err)
29 | }
30 |
31 | // Clean it up once the test is done.
32 | defer os.RemoveAll(path)
33 | }
34 |
35 | TestRun(t, kv, nfail, ncli, maxstep, maxpri)
36 | }
37 |
38 | func TestSimpleStore(t *testing.T) {
39 | testRun(t, 1, 3, 1, 10, 100) // Standard f=1 case,
40 | testRun(t, 1, 3, 2, 10, 100) // varying number of clients
41 | testRun(t, 1, 3, 10, 3, 100)
42 | testRun(t, 1, 3, 20, 2, 100)
43 | testRun(t, 1, 3, 40, 2, 100)
44 |
45 | testRun(t, 2, 6, 10, 5, 100) // Standard f=2 case
46 | testRun(t, 3, 9, 10, 3, 100) // Standard f=3 case
47 | testRun(t, 4, 12, 10, 2, 100) // Standard f=4 case
48 | testRun(t, 5, 15, 10, 2, 100) // Standard f=5 case
49 | }
50 |
--------------------------------------------------------------------------------
/go/model/qscod/fs/store/store.go:
--------------------------------------------------------------------------------
1 | // Package store provides a file system key/value Store for QSCOD.
2 | // It uses the cas package to implement versioned write-once and read,
3 | // with garbage collection of old versions before the last known commit.
4 | //
5 | package store
6 |
7 | import (
8 | "context"
9 |
10 | "github.com/dedis/tlc/go/lib/backoff"
11 | "github.com/dedis/tlc/go/lib/fs/verst"
12 | . "github.com/dedis/tlc/go/model/qscod/core"
13 | "github.com/dedis/tlc/go/model/qscod/encoding"
14 | )
15 |
16 | // FileStore implements a QSCOD key/value store
17 | // as a directory in a file system.
18 | //
19 | type FileStore struct {
20 | state verst.State
21 | ctx context.Context
22 | bc backoff.Config
23 | }
24 |
25 | // Initialize FileStore to use a directory at a given file system path.
26 | // If create is true, create the designated directory if it doesn't exist.
27 | // If excl is true, fail if the designated directory already exists.
28 | func (fs *FileStore) Init(ctx context.Context, path string, create, excl bool) error {
29 |
30 | fs.ctx = ctx
31 | return fs.state.Init(path, create, excl)
32 | }
33 |
34 | // SetReport sets the backoff configuration for handling errors that occur
35 | // while attempting to access the key/value store on the file system.
36 | //
37 | // Since we don't know in general which errors may be transitory
38 | // and which are permanent failures, especially on remote file systems,
39 | // FileStore assumes all errors may be transitory, just reports them,
40 | // and keeps trying the access after a random exponential backoff.
41 | //
42 | func (fs *FileStore) SetReport(bc backoff.Config) {
43 | fs.bc = bc
44 | }
45 |
46 | // Attempt to write the value v to a file associated with time-step step,
47 | // then read back whichever value was successfully written first.
48 | // Implements the qscod.Store interface.
49 | //
50 | func (fs *FileStore) WriteRead(v Value) (rv Value) {
51 |
52 | // Don't try to write version 0; that's a virtual placeholder.
53 | if v.S == 0 {
54 | return v
55 | }
56 |
57 | try := func() (err error) {
58 | rv, err = fs.tryWriteRead(v)
59 | return err
60 | }
61 |
62 | fs.bc.Retry(fs.ctx, try)
63 | return rv
64 | }
65 |
66 | func (fs *FileStore) tryWriteRead(val Value) (Value, error) {
67 | ver := val.S
68 |
69 | // Serialize the proposed value
70 | valb, err := encoding.EncodeValue(val)
71 | if err != nil {
72 | return Value{}, err
73 | }
74 | vals := string(valb)
75 |
76 | // Try to write it to the versioned store -
77 | // but don't fret if someone else wrote it or if it has expired.
78 | err = fs.state.WriteVersion(ver, vals)
79 | if err != nil && !verst.IsExist(err) && !verst.IsNotExist(err) {
80 | return Value{}, err
81 | }
82 |
83 | // Now read back whatever value was successfully written.
84 | vals, err = fs.state.ReadVersion(ver)
85 | if err != nil && verst.IsNotExist(err) {
86 |
87 | // The requested version has probably been aged out,
88 | // so catch up to the most recent committed Head.
89 | _, vals, err = fs.state.ReadLatest()
90 | }
91 | if err != nil {
92 | return Value{}, err
93 | }
94 |
95 | // Deserialize the value we read
96 | val, err = encoding.DecodeValue([]byte(vals))
97 | if err != nil {
98 | return Value{}, err
99 | }
100 |
101 | // Expire all versions before this latest one
102 | fs.state.Expire(val.S)
103 |
104 | // Return the value v that we read
105 | return val, err
106 | }
107 |
--------------------------------------------------------------------------------
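A short sketch of creating one FileStore per consensus group member, much as the test below does (the helper name and directory names are hypothetical; assumes imports of context, fmt, and the core package):

    // openGroupStores creates and initializes one FileStore per member.
    func openGroupStores(ctx context.Context, n int) ([]core.Store, error) {
        kv := make([]core.Store, n)
        for i := range kv {
            fs := &FileStore{}
            // create each node's backing directory, failing if it already exists
            if err := fs.Init(ctx, fmt.Sprintf("node-%d", i), true, true); err != nil {
                return nil, err
            }
            kv[i] = fs
        }
        return kv, nil // usable as the KV slice of a core.Client
    }
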
/go/model/qscod/fs/store/store_test.go:
--------------------------------------------------------------------------------
1 | package store
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "os"
7 | "testing"
8 |
9 | . "github.com/dedis/tlc/go/model/qscod/core"
10 | . "github.com/dedis/tlc/go/model/qscod/core/test"
11 | )
12 |
13 | // Run a consensus test case with the specified parameters.
14 | func testRun(t *testing.T, nfail, nnode, ncli, maxstep, maxpri int) {
15 |
16 | // Create a test key/value store representing each node
17 | kv := make([]Store, nnode)
18 | ctx := context.Background()
19 | for i := range kv {
20 | path := fmt.Sprintf("test-store-%d", i)
21 |
22 | // Remove the test directory if one is left-over
23 | // from a previous test run.
24 | os.RemoveAll(path)
25 |
26 | // Create the test directory afresh.
27 | ss := &FileStore{}
28 | if err := ss.Init(ctx, path, true, true); err != nil {
29 | t.Fatal(err)
30 | }
31 | kv[i] = ss
32 |
33 | // Clean it up once the test is done.
34 | defer os.RemoveAll(path)
35 | }
36 |
37 | TestRun(t, kv, nfail, ncli, maxstep, maxpri)
38 | }
39 |
40 | func TestSimpleStore(t *testing.T) {
41 | testRun(t, 1, 3, 1, 10, 100) // Standard f=1 case,
42 | testRun(t, 1, 3, 2, 10, 100) // varying number of clients
43 | testRun(t, 1, 3, 10, 3, 100)
44 | testRun(t, 1, 3, 20, 2, 100)
45 | testRun(t, 1, 3, 40, 2, 100)
46 |
47 | testRun(t, 2, 6, 10, 5, 100) // Standard f=2 case
48 | testRun(t, 3, 9, 10, 3, 100) // Standard f=3 case
49 |
50 | // Note: when nnode * ncli gets to be around 120-ish,
51 | // we start running into default max-open-file limits.
52 | }
53 |
--------------------------------------------------------------------------------
/go/model/qscod/qscas/doc.go:
--------------------------------------------------------------------------------
1 | // Package qscas provides an implementation of QSCOD consensus
2 | // that both builds on, and provides, a Check-and-Set (CAS) Store interface
3 | // as defined by the tlc/go/lib/cas package.
4 | //
5 | package qscas
6 |
--------------------------------------------------------------------------------
/go/model/qscod/qscas/group.go:
--------------------------------------------------------------------------------
1 | package qscas
2 |
3 | import (
4 | "context"
5 | "sync"
6 |
7 | "github.com/dedis/tlc/go/lib/cas"
8 | "github.com/dedis/tlc/go/model/qscod/core"
9 | )
10 |
11 | // Group implements the cas.Store interface as a QSCOD consensus group.
12 | // After creation, invoke Start to configure the consensus group state,
13 | // then call CompareAndSet to perform CAS operations on the logical state.
14 | type Group struct {
15 | c core.Client // consensus client core
16 | ctx context.Context // group operation context
17 |
18 | mut sync.Mutex // for synchronizing shutdown
19 | wg sync.WaitGroup // counts active CAS operations
20 | done bool // set after group shutdown
21 |
22 | // channel that CAS calls use to propose work to do
23 | ch chan func(int64, string, bool) (string, int64)
24 | }
25 |
26 | // Start initializes g to represent a consensus group comprised of
27 | // particular member nodes, starts it operating, and returns g.
28 | //
29 | // Consensus thresholds are determined by the faulty parameter,
30 | // the maximum number of faulty nodes the group should tolerate.
31 | // For this implementation of QSCOD based on the TLCB and TLCR algorithms,
32 | // faulty should be at most one-third of the total group size.
33 | // If faulty < 0, it is set to one-third of the group size, rounded down.
34 | //
35 | // Start launches worker goroutines that help service CAS requests,
36 | // which will run and consume resources forever unless cancelled.
37 | // To define their lifetime, the caller should pass a cancelable context,
38 | // and cancel it when operations on the Group are no longer required.
39 | //
40 | func (g *Group) Start(ctx context.Context, members []cas.Store, faulty int) *Group {
41 |
42 | // Calculate and sanity-check the threshold configuration parameters.
43 | // For details on where these calculations come from, see:
44 | // https://arxiv.org/abs/2003.02291
45 | N := len(members)
46 | if faulty < 0 {
47 | faulty = N / 3 // Default fault tolerance threshold
48 | }
49 | Tr := N - faulty // receive threshold
50 | Ts := N - Tr + 1 // spread threshold
51 | if Tr <= 0 || Tr > N || Ts <= 0 || Ts > Tr || (Ts+Tr) <= N {
52 | panic("faulty threshold yields unsafe configuration")
53 | }
54 | if N*(Tr-Ts+1)-Tr*(N-Tr) <= 0 { // test if Tb <= 0
55 | panic("faulty threshold yields non-live configuration")
56 | }
57 | //println("N", N, "Tr", Tr, "Ts", Ts)
58 |
59 | // Create a consensus group state instance
60 | g.c = core.Client{Tr: Tr, Ts: Ts}
61 | g.ctx = ctx
62 | g.ch = make(chan func(s int64, p string, c bool) (string, int64))
63 |
64 | // Create a core.Store wrapper around each cas.Store group member
65 | g.c.KV = make([]core.Store, N)
66 | for i := range members {
67 | g.c.KV[i] = &coreStore{Store: members[i], g: g}
68 | }
69 |
70 | // Our proposal function normally just "punts" by waiting for
71 | // an actual proposal to get sent on the group's channel,
72 | // and then we call that to form the proposal as appropriate.
73 | // But we concurrently listen for context cancellation
74 | // and return promptly with a no-op proposal in that case.
75 | g.c.Pr = func(s int64, p string, c bool) (prop string, pri int64) {
76 | for {
77 | select {
78 | case f := <-g.ch: // got a CAS work function to call
79 | if f == nil { // channel was closed
80 | println("Pr: channel closed")
81 | return p, 0 // no-op proposal
82 | }
83 | //println("got work function\n")
84 | prop, pri = f(s, p, c) // call work function
85 | if prop != "" || pri != 0 {
86 | return prop, pri // return its result
87 | }
88 | //println("work function yielded no work")
89 |
90 | case <-ctx.Done(): // our context got cancelled
91 | //println("Pr: cancelled")
92 | return p, 0 // produce no-op proposal
93 | }
94 | }
95 | }
96 |
97 | // Launch the underlying consensus core as a separate goroutine.
98 | // Make sure the group's WaitGroup remains nonzero until
99 | // the context is cancelled and we're ready to shut down.
100 | g.wg.Add(1)
101 | go g.run(ctx)
102 |
103 | return g
104 | }
105 |
106 | // Run consensus in a goroutine
107 | func (g *Group) run(ctx context.Context) {
108 |
109 | // Run the consensus protocol until our context gets cancelled
110 | g.c.Run(ctx)
111 |
112 | // Drain any remaining proposal function sends to the group's channel.
113 | // CompareAndSet won't add any more after g.ctx has been cancelled.
114 | go func() {
115 | for range g.ch {
116 | }
117 | }()
118 |
119 | g.mut.Lock()
120 |
121 | // Wait until no threads are in active CompareAndSet calls.
122 | g.wg.Done()
123 | g.wg.Wait()
124 |
125 | // Now it's safe to close the group's channel.
126 | close(g.ch)
127 | g.done = true
128 |
129 | g.mut.Unlock()
130 | }
131 |
132 | // CompareAndSet conditionally writes a new version and reads the latest,
133 | // implementing the cas.Store interface.
134 | //
135 | func (g *Group) CompareAndSet(ctx context.Context, old, new string) (
136 | version int64, actual string, err error) {
137 |
138 | //println("CAS lastVer", lastVer, "reqVal", reqVal)
139 |
140 | // Record active CompareAndSet calls in a WaitGroup
141 | // so that the group's main goroutine can wait for them to complete
142 | // when shutting down gracefully in response to context cancellation.
143 | // Atomically check that the group is still active before wg.Add.
144 | g.mut.Lock()
145 | if g.done {
146 | //println("CAS after done")
147 | // This should only ever happen once the context is cancelled
148 | if g.ctx.Err() == nil {
149 | panic("group done but context not cancelled?")
150 | }
151 | g.mut.Unlock()
152 | return 0, "", g.ctx.Err()
153 | }
154 | g.wg.Add(1)
155 | g.mut.Unlock()
156 | defer g.wg.Done()
157 |
158 | // We'll need a mutex to protect concurrent accesses to our locals.
159 | mut := sync.Mutex{}
160 |
161 | // Define the proposal formulation function that will do our work.
162 | // Returns the empty string to keep this worker thread waiting
163 | // for something to propose while letting other threads progress.
164 | pr := func(s int64, cur string, com bool) (prop string, pri int64) {
165 | mut.Lock()
166 | defer mut.Unlock()
167 |
168 | //println("CAS step", s, cur, com, "prop", old, "->", new)
169 |
170 | // Now check the situation of what's known to be committed.
171 | switch {
172 |
173 | // It's safe to propose new as the new string to commit
174 | // if the prior value we're building on is equal to old.
175 | case cur == old:
176 | prop, pri = new, randValue()
177 |
178 | // Complete the CAS operation as soon as we commit anything,
179 | // whether it was our new proposal or some other string.
180 | case com:
181 | version, actual = int64(s), cur
182 |
183 | // Otherwise, if the current proposal isn't the same as old
184 | // but also isn't committed, we have to make no-op proposals
185 | // until we manage to get something committed.
186 | default:
187 | println("no-op proposal")
188 | prop, pri = cur, randValue()
189 |
190 | //case int64(s) > lastVer && c && p != prop:
191 | // err = cas.Changed
192 | // fallthrough
193 | //// XXX get rid of Changed?
194 |
195 | //case int64(s) > lastVer && c:
196 | // actualVer = int64(s)
197 | // actualVal = reqVal
198 |
199 | //case int64(s) > lastVer:
200 | // // do nothing
201 |
202 | // Our CAS has succeeded if we've committed a new version
203 | // that builds immediately on the version we were expecting
204 | // and that commits the reqVal we were trying to propose.
205 | // Return "" in prop to have this worker thread keep waiting
206 | // for a future CAS operation to propose something useful.
207 | // case int64(L.Step) == lastVer && C.Data == reqVal:
208 | // println("proposal committed at step", C.Step)
209 | // if int64(C.Step) <= lastVer {
210 | // panic("XXX")
211 | // }
212 | // actualVer = int64(s)
213 | // actualVal = reqVal
214 |
215 | // Otherwise, our CAS fails with a Changed error as soon as
216 | // anything else gets committed on top of lastVer.
217 | // Return "" in prop to keep this worker thread waiting.
218 | // case int64(C.Step) > lastVer:
219 | // println("proposal overridden at step", C.Step)
220 | // actualVer = int64(C.Step)
221 | // actualVal = C.Data
222 | // err = cas.Changed
223 |
224 | // If C.Step < lastVer, we're choosing a proposal for a node
225 | // that doesn't yet "know" that lastVer was committed.
226 | // Just return a "safe" no-op proposal for this node,
227 | // although we know it has no chance of being committed.
228 | // case int64(C.Step) < lastVer:
229 | // println(i, "outdated at", C.Step, "<", lastVer,
230 | // "data", C.Data)
231 | // prop, pri = C.Data, 0
232 |
233 | //default:
234 | // panic("lastVer appears to be from the future")
235 | }
236 | return
237 | }
238 |
239 | // A simple helper function to test if we've completed our work.
240 | done := func() bool {
241 | mut.Lock()
242 | defer mut.Unlock()
243 | return actual != "" || err != nil
244 | }
245 |
246 | // Continuously send references to our proposal function
247 | // to the group's channel so it will get called until it finishes
248 | // or until one of the contexts (ours or the group's) is cancelled.
249 | // Since the channel is unbuffered, each send will block
250 | // until some consensus worker thread is ready to receive it.
251 | for !done() && ctx.Err() == nil && g.ctx.Err() == nil {
252 | //println("CAS sending", old, "->", new)
253 | g.ch <- pr
254 | }
255 | // println("CAS done", lastVer, "reqVal", reqVal,
256 | // "actualVer", actualVer, "actualVal", actualVal, "err", err)
257 | return version, actual, err
258 | }
259 |
--------------------------------------------------------------------------------
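A hedged sketch (the function name is hypothetical; assumes imports of context, fmt, and github.com/dedis/tlc/go/lib/cas) of using Group as a cas.Store over in-memory cas.Register members, much as group_test.go below does with checking wrappers added:

    func runGroupCAS() {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel() // stops the group's worker goroutines

        members := make([]cas.Store, 3)
        for i := range members {
            members[i] = &cas.Register{} // one in-memory CAS register per member
        }

        g := (&Group{}).Start(ctx, members, 1) // tolerate one faulty member

        // Attempt to advance the group's state from "" to "first";
        // actual reports whatever value was actually committed at version ver.
        ver, actual, err := g.CompareAndSet(ctx, "", "first")
        fmt.Println(ver, actual, err)
    }
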
/go/model/qscod/qscas/group_test.go:
--------------------------------------------------------------------------------
1 | package qscas
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "testing"
7 |
8 | "github.com/dedis/tlc/go/lib/cas"
9 | "github.com/dedis/tlc/go/lib/cas/test"
10 | )
11 |
12 | // Run a consensus test case with the specified parameters.
13 | func testRun(t *testing.T, nfail, nnode, nclients, nthreads, naccesses int) {
14 |
15 | desc := fmt.Sprintf("F=%v,N=%v,Clients=%v,Threads=%v,Accesses=%v",
16 | nfail, nnode, nclients, nthreads, naccesses)
17 | t.Run(desc, func(t *testing.T) {
18 |
19 | // Create a cancelable context for the test run
20 | ctx, cancel := context.WithCancel(context.Background())
21 |
22 | // Create an in-memory CAS register representing each node
23 | members := make([]cas.Store, nnode)
24 | memhist := make([]test.History, nnode)
25 | for i := range members {
26 | members[i] = &cas.Register{}
27 | }
28 |
29 | // Create a consensus group Store for each simulated client
30 | clients := make([]cas.Store, nclients)
31 | for i := range clients {
32 |
33 | // Interpose checking wrappers on the CAS registers
34 | checkers := make([]cas.Store, nnode)
35 | for i := range checkers {
36 | checkers[i] = test.Checked(t, &memhist[i],
37 | members[i])
38 | }
39 |
40 | clients[i] = (&Group{}).Start(ctx, checkers, nfail)
41 | }
42 |
43 | // Run a standard torture-test across all the clients
44 | test.Stores(t, nthreads, naccesses, clients...)
45 |
46 | // Shut down all the clients by canceling the context
47 | cancel()
48 | })
49 | }
50 |
51 | // Test the Client with a trivial in-memory key/value Store implementation.
52 | func TestClient(t *testing.T) {
53 | testRun(t, 1, 3, 1, 1, 1000) // Standard f=1 case
54 | testRun(t, 1, 3, 2, 1, 1000)
55 | testRun(t, 1, 3, 10, 1, 1000)
56 | testRun(t, 1, 3, 20, 1, 100)
57 | testRun(t, 1, 3, 50, 1, 10)
58 | testRun(t, 1, 3, 100, 1, 10)
59 |
60 | testRun(t, 2, 6, 10, 10, 100) // Standard f=2 case
61 | testRun(t, 3, 9, 10, 10, 100) // Standard f=3 case
62 | testRun(t, 4, 12, 10, 10, 100) // Standard f=4 case
63 | testRun(t, 5, 15, 10, 10, 100) // Standard f=5 case
64 |
65 | // Additional f=1 runs with more concurrent threads per client
66 | testRun(t, 1, 3, 10, 10, 1000)
67 | testRun(t, 1, 3, 10, 10, 1000)
68 | }
69 |
--------------------------------------------------------------------------------
/go/model/qscod/qscas/rand.go:
--------------------------------------------------------------------------------
1 | package qscas
2 |
3 | import (
4 | "crypto/rand"
5 | "encoding/binary"
6 | )
7 |
8 | // Generate a 63-bit positive integer from strong cryptographic randomness.
9 | func randValue() int64 {
10 | var b [8]byte
11 | _, err := rand.Read(b[:])
12 | if err != nil {
13 | panic("error reading cryptographic randomness: " + err.Error())
14 | }
15 | return int64(binary.BigEndian.Uint64(b[:]) &^ (1 << 63))
16 | }
17 |
--------------------------------------------------------------------------------
/go/model/qscod/qscas/store.go:
--------------------------------------------------------------------------------
1 | package qscas
2 |
3 | import (
4 | "github.com/dedis/tlc/go/lib/backoff"
5 | "github.com/dedis/tlc/go/lib/cas"
6 | "github.com/dedis/tlc/go/model/qscod/core"
7 | "github.com/dedis/tlc/go/model/qscod/encoding"
8 | )
9 |
10 | // coreStore implements QSCOD core's native Store interface
11 | // based on a cas.Store interface.
12 | type coreStore struct {
13 | cas.Store // underlying CAS state store
14 | g *Group // group this store is associated with
15 | lvals string // last value we observed in the underlying Store
16 | lval core.Value // deserialized last value
17 | }
18 |
19 | func (cs *coreStore) WriteRead(v core.Value) (rv core.Value) {
20 |
21 | try := func() (err error) {
22 | rv, err = cs.tryWriteRead(v)
23 | return err
24 | }
25 |
26 | // Try to perform the atomic operation until it succeeds
27 | // or until the group's context gets cancelled.
28 | err := backoff.Retry(cs.g.ctx, try)
29 | if err != nil && cs.g.ctx.Err() != nil {
30 |
31 | // The group's context got cancelled,
32 | // so just silently return nil Values
33 | // until the consensus worker threads catch up and terminate.
34 | //println("WriteRead cancelled")
35 | return core.Value{}
36 | }
37 | if err != nil {
38 | panic("backoff.Retry inexplicably gave up: " + err.Error())
39 | }
40 | return rv
41 | }
42 |
43 | func (cs *coreStore) tryWriteRead(val core.Value) (core.Value, error) {
44 |
45 | // Serialize the proposed value
46 | valb, err := encoding.EncodeValue(val)
47 | if err != nil {
48 | println("encoding error", err.Error())
49 | return core.Value{}, err
50 | }
51 | vals := string(valb)
52 |
53 | // Try to set the underlying CAS register to the proposed value
54 | // only as long as doing so would strictly increase its TLC step
55 | for val.S > cs.lval.S {
56 |
57 | // Write the serialized value to the underlying CAS interface
58 | _, avals, err := cs.CompareAndSet(cs.g.ctx, cs.lvals, vals)
59 | if err != nil {
60 | println("CompareAndSet error", err.Error())
61 | return core.Value{}, err
62 | }
63 |
64 | // Deserialize the actual value we read back
65 | aval, err := encoding.DecodeValue([]byte(avals))
66 | if err != nil {
67 | println("decoding error", err.Error())
68 | return core.Value{}, err
69 | }
70 |
71 | // println("tryWriteRead step",
72 | // cs.lval.S, "w", val.S, "->", aval.S,
73 | // "casver", cs.lver, "->", aver)
74 |
75 | if aval.S <= cs.lval.S {
76 | panic("CAS failed to advance TLC step!")
77 | }
78 |
79 | // Update our record of the underlying CAS version and value
80 | //println("update from step", cs.lval.S, "to step", aval.S)
81 | cs.lvals, cs.lval = avals, aval
82 | }
83 |
84 | //println("cs returning newer step", cs.lval.S)
85 | return cs.lval, nil
86 | }
87 |
--------------------------------------------------------------------------------
/go/model/quepaxa/consensus.go:
--------------------------------------------------------------------------------
1 | package quepaxa
2 |
3 | import (
4 | "context"
5 | "sync"
6 | )
7 |
8 | type Node int32
9 | type Choice int64
10 | type Step int32
11 |
12 | // A logical time consists of
13 | // a Choice (consensus decision or slot number) and
14 | // a Step (consensus attempt number within a turn).
15 | type Time struct {
16 | c Choice
17 | s Step
18 | }
19 |
20 | // Returns true if logical time T1 is strictly less than T2.
21 | func (t1 Time) LT(t2 Time) bool {
22 | return t1.c < t2.c || (t1.c == t2.c && t1.s < t2.s)
23 | }
24 |
25 | type Replica[P Proposal[P]] interface {
26 | Record(ctx context.Context, t Time, p P) (
27 | rt Time, rf P, rl P, err error)
28 | }
29 |
30 | type Proposer[P Proposal[P]] struct {
31 |
32 | // configuration state
33 | w []worker[P] // one worker per replica
34 | th int // consensus threshold (n-f)
35 |
36 | // synchronization state
37 | m sync.Mutex
38 | c sync.Cond
39 |
40 | t Time // proposer's current logical time
41 |
42 | // per-choice state
43 | ld Node // which replica is the leader, -1 if none
44 | dp P // decision proposal from last choice
45 | nf int // number of fast-path responses this choice
46 |
47 | // per-step state
48 | pp P // preferred proposal for this step
49 | bp P // best of appropriate replies this step
50 | nr int // number of responses seen so far this step
51 |
52 | // graceful termination state
53 | stop bool // signal when workers should shut down
54 | ctx context.Context // cancelable context for all of our workers
55 | cancel context.CancelFunc // cancellation function
56 | }
57 |
58 | func (p *Proposer[P]) Init(replicas []Replica[P]) {
59 |
60 | if p.w != nil {
61 | panic("Proposer.Init must not be invoked twice")
62 | }
63 |
64 | // set up a cancelable context for when we want to stop
65 | p.ctx, p.cancel = context.WithCancel(context.Background())
66 | p.c.L = &p.m // the condition variable must use the proposer's mutex
67 | // set the threshold appropriately for group size
68 | p.th = len(replicas)/2 + 1
69 |
70 | p.w = make([]worker[P], len(replicas))
71 | for i := range replicas {
72 | p.w[i].p = p
73 | p.w[i].r = replicas[i]
74 | p.w[i].i = Node(i)
75 |
76 | go p.w[i].work()
77 | }
78 | }
79 |
80 | func (p *Proposer[P]) Agree(preferred P) (choice Choice, decision P) {
81 |
82 | // keep our mutex locked except while waiting on a condition
83 | p.m.Lock()
84 | defer p.m.Unlock()
85 |
86 | c := p.t.c
87 | if p.t.s < 4 {
88 | p.advance(Time{p.t.c, 4}, preferred)
89 | }
90 | p.c.Broadcast() // signal any non-busy workers that there's new work to do
91 | for !p.stop && p.t.c == c {
92 | p.c.Wait() // wait until a decision is reached for this choice
93 | }
94 |
95 | // return choice at which last decision was made, and that decision
96 | return p.t.c - 1, p.dp
97 | }
98 |
99 | // Advance to time t with preferred proposal pp.
100 | // Proposer's mutex must be locked.
101 | func (p *Proposer[P]) advance(t Time, pp P) {
102 | p.t = t // new time step
103 | p.pp = pp // preferred proposal entering new step
104 | p.bp = pp.Nil() // initial best proposal from new step
105 | p.nr = 0 // count responses toward threshold
106 |
107 | if t.s == 4 { // only when advancing to fast-path step...
108 | p.nf = 0 // initialize fast-path response count
109 | }
110 | }
111 |
112 | // Each worker thread calls workDone when it gets a response from a recorder.
113 | //
114 | // This function gets called at most once per recorder per time step,
115 | // so it can count responses without worrying about duplicates.
116 | func (p *Proposer[P]) workDone(rt Time, rf, rl P) {
117 |
118 | // When we receive fast-path responses from phase 4 of current choice,
119 | // count them towards the fast-path threshold even if they come late.
120 | if rt.c == p.t.c && rt.s == 4 {
121 | p.nf++
122 | if p.nf == p.th {
123 | p.decided(rf) // fast-path decision
124 | }
125 | }
126 |
127 | // where is the proposer with respect to the response in logical time?
128 | if rt.LT(p.t) { // is the response behind the proposer?
129 | return // the work done is obsolete - just discard
130 | }
131 | if p.t.LT(rt) { // is the response ahead of the proposer?
132 | p.advance(rt, rf) // advance to newer time in response
133 | return
134 | }
135 | // the response is from proposer's current time step exactly
136 |
137 | // what we do with the response depends on which phase we're in
138 | if rt.s&3 == 0 {
139 | p.bp = p.bp.Best(rf) // Phase 0: best of first proposals
140 | } else if rt.s&2 != 0 {
141 | p.bp = p.bp.Best(rl) // Phase 2-3: best of last aggregate
142 | }
143 |
144 | // have we reached the response threshold for this step?
145 | p.nr++
146 | if p.nr < p.th {
147 | return // not yet, wait for more responses
148 | }
149 | // threshold reached, so we can complete this time step
150 |
151 | // in phase 2, check if we've reached a consensus decision
152 | if rt.s&3 == 2 && p.pp.EqD(p.bp) {
153 | p.decided(p.pp)
154 | return
155 | }
156 | // no decision yet but still end of current time step
157 |
158 | // in phases 0 and 3, new preferred proposal is best from replies
159 | pp := p.pp
160 | if rt.s&3 == 0 || rt.s&3 == 3 {
161 | pp = p.bp
162 | }
163 |
164 | // advance to next logical time step
165 | p.advance(Time{p.t.c, p.t.s + 1}, pp)
166 | }
167 |
168 | func (p *Proposer[P]) decided(dp P) {
169 |
170 | // record the decision in local state
171 | p.t.c++ // last choice is decided, now on to next
172 | p.t.s = 0 // idle but ready for a new agreement
173 | p.dp = dp // record decision proposal from last choice
174 | p.ld = -1 // default to no leader, but caller can change
175 |
176 | // signal the main proposer thread to return the decision,
177 | // while the workers inform the recorders asynchronously.
178 | p.c.Broadcast()
179 | }
180 |
181 | // SetLeader lets the application, immediately after observing a decision,
182 | // select a new leader based on that decision.
183 | // If SetLeader is not called, the next choice is leaderless.
184 | // The choice of leader (or lack thereof) must be deterministic
185 | // based on prior decisions and set the same on all nodes.
186 | func (p *Proposer[P]) SetLeader(leader Node) {
187 | p.ld = leader
188 | }
189 |
190 | // Stop permanently shuts down this proposer and its worker threads.
191 | func (p *Proposer[P]) Stop() {
192 |
193 | p.stop = true // signal that workers should stop
194 | p.c.Broadcast() // wake them up to see the signal
195 | p.cancel() // also cancel all Record calls in progress
196 | }
197 |
198 | // We create one worker per replica.
199 | type worker[P Proposal[P]] struct {
200 | p *Proposer[P] // back pointer to Proposer
201 | r Replica[P] // Replica interface of this replica
202 | i Node // replica number of this replica
203 | }
204 |
205 | func (w *worker[P]) work() {
206 | p := w.p // keep handy pointer back to proposer
207 | p.m.Lock()
208 | for !p.stop {
209 | // we're done with prior steps so wait until proposer advances
210 | t := p.t // save proposer's current time
211 | for p.t == t && !p.stop { // wait until the proposer advances (or stops)
212 | p.c.Wait()
213 | }
214 |
215 | t, pp := p.t, p.pp // adopt the proposer's new time and preferred proposal
216 | if t.s&3 == 0 { // in phase zero we must re-rank proposals
217 | pp = pp.Rank(w.i, t.s == 4 && w.i == p.ld)
218 | }
219 |
220 | // asynchronously record the proposal with the mutex unlocked
221 | p.m.Unlock()
222 | rt, rf, rl, err := w.r.Record(p.ctx, t, pp)
223 | if err != nil { // canceled
224 | return
225 | // XXX backoff retry?
226 | }
227 | p.m.Lock()
228 |
229 | // inform the Proposer that this recorder's work is done
230 | p.workDone(rt, rf, rl)
231 | }
232 | p.m.Unlock()
233 | }
234 |
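
Taken together, Agree and SetLeader are the whole application-facing surface of this proposer: each Agree call drives one choice to a decision through the four phases keyed off the step number modulo four (propose, spread E, gather E / spread C, gather C), and SetLeader optionally installs a leader for the next choice. The following is a minimal sketch of an application loop driving a Proposer that has already been initialized with its replicas as shown earlier in this file; the helpers next and chooseLeader are illustrative and not part of the repository.

// Illustrative only (not part of the repository): drive successive
// agreements with a Proposer that has already been initialized.
func runChoices[P Proposal[P]](p *Proposer[P], next func() P, chooseLeader func(P) Node) {
	for {
		// Agree blocks until the current choice reaches a decision.
		choice, decision := p.Agree(next())
		_ = choice // e.g., use choice as the log index for this decision

		// Leader selection must be deterministic from prior decisions
		// so that every node installs the same leader (or none).
		p.SetLeader(chooseLeader(decision))
	}
}
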
--------------------------------------------------------------------------------
/go/model/quepaxa/isr.go:
--------------------------------------------------------------------------------
1 | package quepaxa
2 |
3 | // Interval Summary Register (ISR)
4 | type ISR[P Proposal[P]] struct {
5 | t Time // current logical time step
6 | f P // first value seen in this step
7 | a P // aggregated values so far in this step
8 | l P // aggregated values seen in last step
9 | }
10 |
11 | func (r *ISR[P]) Record(t Time, p P) (Time, P, P) {
12 |
13 | if r.t.LT(t) {
14 | // Our recorder state needs to catch up to time t
15 | if t.s == r.t.s+1 {
16 | r.l = r.a
17 | } else {
18 | r.l = r.l.Nil()
19 | }
20 | r.t = t
21 | r.f = p
22 | r.a = p
23 |
24 | } else if !t.LT(r.t) {
25 |
26 | // At exactly the right time step - just aggregate proposals
27 | r.a = r.a.Best(p)
28 |
29 | } else {
30 | // proposal p is obsolete - just discard it
31 | }
32 |
33 | // In any case, return the latest recorder state
34 | return r.t, r.f, r.l
35 | }
36 |
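
To make the register's behavior concrete, here is a minimal sketch (not part of the repository) exercising it from within this package, using the BasicProposal type defined in proposal.go; the ranks, node numbers, and data values are made up.

// isrExample is illustrative only: it shows how the ISR hands the
// previous step's aggregate back to the proposer at the next step.
func isrExample() {
	var r ISR[BasicProposal[string]]
	a := BasicProposal[string]{R: 3, N: 1, D: "a"}
	b := BasicProposal[string]{R: 7, N: 2, D: "b"}

	r.Record(Time{0, 4}, a)              // step 4: first = a, aggregate = a
	r.Record(Time{0, 4}, b)              // step 4: aggregate becomes Best(a, b) = b
	_, rf, rl := r.Record(Time{0, 5}, a) // step 5: rf = a (first of step 5), rl = b (best of step 4)
	_, _ = rf, rl
}
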
--------------------------------------------------------------------------------
/go/model/quepaxa/proposal.go:
--------------------------------------------------------------------------------
1 | package quepaxa
2 |
3 | import (
4 | "crypto/rand"
5 | "encoding/binary"
6 | "math"
7 | )
8 |
9 | // The Proposal interface defines constraints for a concrete proposal type P.
10 | //
11 | // The Rank method must return the same proposal with rank set appropriately:
12 | // - to the maximum rank High if leader is set (this replica is the leader)
13 | // - to a freshly-chosen random rank between 1 and High-1 otherwise
14 | //
15 | // In addition, if proposal ranks are low-entropy so there is a chance of ties,
16 | // and P is using replica numbers for tiebreaking,
17 | // then the Rank function also sets the replica number in the proposal.
18 | type Proposal[P any] interface {
19 | Nil() P // the nil proposal
20 | Best(other P) P // best of this and other
21 | Rank(replica Node, leader bool) P // randomly rank proposal
22 | EqD(other P) bool // equality for deciding
23 | }
24 |
25 | // BasicProposal provides a basic proposal design
26 | // that represents a reasonable "sweet spot" for most purposes.
27 | //
28 | // Proposals are randomly ranked using 31 bits of private randomness,
29 | // drawn from the cryptographic random source for strong unpredictability,
30 | // which might conceivably be needed to protect against a strong DoS attacker.
31 | // Since 31-bit random ranks do not have high entropy,
32 | // BasicProposal uses replica numbers for breaking ties.
33 | //
34 | // BasicProposal contains a field D of parameterized type Data,
35 | // containing any application-defined data associated with the proposal.
36 | // This type may contain pointers or slices (e.g., referring to bulk data)
37 | // provided the referenced data objects do not change during consensus.
38 | // The BasicProposal does nothing with this data field other than copy it.
39 | type BasicProposal[Data any] struct {
40 | R uint32 // Randomized rank or priority
41 | N Node // Replica for which proposal was created
42 | D Data // Application-defined data
43 | }
44 |
45 | const basicProposalHighRank = math.MaxUint32
46 |
47 | func (_ BasicProposal[D]) Nil() BasicProposal[D] {
48 | return BasicProposal[D]{}
49 | }
50 |
51 | func (p BasicProposal[D]) Best(o BasicProposal[D]) BasicProposal[D] {
52 | if o.R > p.R || (o.R == p.R && o.N > p.N) {
53 | return o
54 | }
55 | return p
56 | }
57 |
58 | func (p BasicProposal[D]) Rank(node Node, leader bool) BasicProposal[D] {
59 |
60 | // record the replica number that this proposal was produced for
61 | p.N = node
62 |
63 | if leader {
64 | // the leader always uses the reserved maximum rank
65 | p.R = basicProposalHighRank
66 |
67 | } else {
68 | // read 32 bits of randomness
69 | var b [4]byte
70 | _, err := rand.Read(b[:])
71 | if err != nil {
72 | panic("unable to read cryptographically random bits: " +
73 | err.Error())
74 | }
75 |
76 | // produce a 31-bit rank, avoiding the zero rank
77 | p.R = (binary.BigEndian.Uint32(b[:]) & 0x7fffffff) + 1
78 | }
79 | return p
80 | }
81 |
82 | func (p BasicProposal[D]) EqD(o BasicProposal[D]) bool {
83 | return p.R == o.R && p.N == o.N
84 | }
85 |
86 | var bp BasicProposal[struct{}] // instantiate BasicProposal to ensure it compiles
87 | var prop Proposer[BasicProposal[struct{}]] // check that BasicProposal satisfies the Proposal constraint
88 |
--------------------------------------------------------------------------------
/go/model/tlc.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | // Create a copy of our message template for transmission.
4 | // Sends QSC state only for the rounds still in our window.
5 | func (n *Node) newMsg() *Message {
6 | msg := n.m // copy template
7 | msg.QSC = append([]Round{}, n.m.QSC[n.m.Step:]...) // active QSC state
8 | return &msg
9 | }
10 |
11 | // Broadcast a copy of our current message template to all nodes
12 | func (n *Node) broadcastTLC() {
13 | msg := n.newMsg()
14 | for i := 0; i < n.nnode; i++ {
15 | n.send(i, msg)
16 | }
17 | }
18 |
19 | // Advance to the next TLC time step.
20 | //
21 | // The client must invoke this function once after calling NewNode
22 | // to launch the protocol and broadcast the message for TLC time-step zero.
23 | // Thereafter, TLC advances time automatically based on network communication.
24 | //
25 | func (n *Node) Advance() {
26 |
27 | // Initialize message template with a proposal for the new time step
28 | n.m.Step++ // Advance to next time step
29 | n.m.Type = Raw // Broadcast raw proposal first
30 | n.acks = 0 // No acknowledgments received yet in this step
31 | n.wits = 0 // No threshold witnessed messages received yet
32 |
33 | // Notify the upper (QSC) layer of the advancement of time,
34 | // and let it fill in its part of the new message to broadcast.
35 | n.advanceQSC()
36 |
37 | n.broadcastTLC() // broadcast our raw proposal
38 | }
39 |
40 | // Receive is called by the client or network layer on receipt of a Message
41 | // from a peer.
42 | // Any unmarshaling that may be required must have already been done.
43 | //
44 | // This function assumes that peer-to-peer connections are ordered and reliable,
45 | // as they are when sent over Go channels or TCP/TLS connections.
46 | // It also assumes that connection or peer failures are permanent:
47 | // this implementation of QSC does not support restarting/resuming connections.
48 | //
49 | func (n *Node) Receive(msg *Message) {
50 |
51 | // Process only messages from the current or next time step.
52 | // We could accept and merge in information from older messages,
53 | // but it's perfectly safe and simpler just to ignore old messages.
54 | if msg.Step >= n.m.Step {
55 |
56 | // If msg is ahead of us, then virally catch up to it
57 | // Since we receive messages from a given peer in order,
58 | // a message we receive can be at most one step ahead of ours.
59 | if msg.Step > n.m.Step {
60 | n.Advance()
61 | }
62 |
63 | // Merge in received QSC state for rounds still in our pipeline
64 | mergeQSC(n.m.QSC[msg.Step:], msg.QSC)
65 |
66 | // Now process this message according to type.
67 | switch msg.Type {
68 | case Raw: // Acknowledge unwitnessed proposals.
69 | ack := n.newMsg()
70 | ack.Type = Ack
71 | n.send(msg.From, ack)
72 |
73 | case Ack: // Collect a threshold of acknowledgments.
74 | n.acks++
75 | if n.m.Type == Raw && n.acks >= n.thres {
76 | n.m.Type = Wit // Prop now threshold witnessed
77 | n.witnessedQSC()
78 | n.broadcastTLC()
79 | }
80 |
81 | case Wit: // Collect a threshold of threshold witnessed messages
82 | n.wits++ // witnessed messages in this step
83 | if n.wits >= n.thres {
84 | n.Advance() // tick the clock
85 | }
86 | }
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/spin/README.md:
--------------------------------------------------------------------------------
1 | This directory contains minimal models of QuePaxa consensus,
2 | Que Sera Consensus (QSC), and Threshold Logical Clocks (TLC)
3 | for the [Spin model checker](https://spinroot.com/spin/whatispin.html).
4 | To test a model, run the provided `run.sh` script on it (e.g., `./run.sh qsc.pml`) after installing Spin.
5 |
6 | For background information on QSC and TLC,
7 | and other model implementations in several languages, please see the
8 | [top level of this repository](https://github.com/dedis/tlc/).
9 |
--------------------------------------------------------------------------------
/spin/qp.pml:
--------------------------------------------------------------------------------
1 | // Simple model of QuePaxa consensus.
2 | // Recorder logic runs atomically in-line within the proposer code.
3 |
4 | #define N 3 // total number of recorder (state) nodes
5 | #define F 1 // number of failures tolerated
6 | #define T (N-F) // consensus threshold required
7 |
8 | #define M 2 // number of proposers (clients)
9 |
10 | #define STEPHI 11 // highest step number to simulate
11 | #define RAND 2 // random part of fitness space is 1..RAND
12 | #define HI (RAND+1) // top priority for proposals by leader
13 | #define VALS 2 // space of preferred values is 1..VALS
14 |
15 | // A proposal is an integer divided into two bit-fields: fitness and value.
16 | #define VALBITS 4
17 | #define FITBITS 4
18 | #define VALSHIFT (0)
19 | #define FITSHIFT (VALBITS)
20 | #define PROP(f,v) (((f) << FITSHIFT) | ((v) << VALSHIFT))
21 | #define VAL(p) (((p) >> VALSHIFT) & ((1 << VALBITS)-1))
22 | #define FIT(p) (((p) >> FITSHIFT) & ((1 << FITBITS)-1))
23 |
24 | #define MAX(a, b) ((a) > (b) -> (a) : (b))
25 |
26 | // Recorder state: implements an interval summary register (ISR),
27 | // which returns the first value submitted in this time step
28 | // and the maximum of all values submitted in the prior time step
29 | typedef Rec {
30 | byte s; // step number
31 | byte f; // first value submitted in this step
32 | byte a; // maximum value seen so far in this step
33 | byte m; // maximum value seen in prior step (s-1)
34 | }
35 |
36 | Rec rec[1+N]; // state of recorder nodes 1..N
37 | byte decided; // proposed value that we've decided on
38 | byte leader; // which proposer is the well-known leader
39 |
40 | #define DECIDE(j, s, p) atomic { \
41 | printf("%d step %d decided <%d,%d>", j, s, FIT(p), VAL(p)); \
42 | assert(decided == 0 || decided == VAL(p)); \
43 | decided = VAL(p); \
44 | }
45 |
46 |
47 | // We model one process per proposer.
48 | proctype Proposer(byte j) { // We're proposer j in 1..M
49 | byte s, t;
50 | byte p, g;
51 | byte i, recs, mask; // recorders we've interacted with
52 | bit done; // for detecting early-decision opportunities
53 |
54 | // Choose the arbitrary initial "preferred value" of this proposer
55 | s = 4;
56 | select (t : 1 .. VALS); // select a "random" value into temporary
57 | p = PROP(HI, t);
58 | printf("%d proposing %d\n", j, t);
59 |
60 | do // iterate over time-steps
61 | :: s <= STEPHI ->
62 | printf("%d step %d\n", j, s);
63 |
64 | // Send and get reply from threshold of recorders
65 | recs = 0; // number of recorders we've heard from
66 | mask = 0; // bit mask of those recorders
67 | g = 0; // gather best response proposer saw so far
68 | done = true;
69 | select (i : 1 .. N); // first recorder to interact with
70 | do // interact with the recorders in any order
71 | :: recs < T && (mask & (1 << i)) == 0 ->
72 |
73 | atomic {
74 | // Randomize fitnesses if we're not the leader
75 | if
76 | :: (s & 3) == 0 && j != leader ->
77 | select(t : 1 .. RAND);
78 | p = PROP(t, VAL(p));
79 | :: else -> skip
80 | fi
81 | assert(FIT(p) > 0 && VAL(p) > 0);
82 |
83 | // enter the recorder/ISR role (via "RPC").
84 | printf("%d step %d ISR <%d,%d> to %d\n",
85 | j, s, FIT(p), VAL(p), i);
86 |
87 | // first catch up the recorder if appropriate
88 | if
89 | :: s > rec[i].s ->
90 | rec[i].m = ((s == (rec[i].s+1)) ->
91 | rec[i].a : 0);
92 | rec[i].s = s;
93 | rec[i].f = p;
94 | rec[i].a = p;
95 |
96 | :: s == rec[i].s ->
97 | rec[i].a = MAX(rec[i].a, p);
98 |
99 | :: else -> skip
100 | fi
101 |
102 | // we're back to the proposer's logic now,
103 | // incorporating the recorder's "response".
104 | assert(s <= rec[i].s);
105 | if
106 | :: s == rec[i].s && (s & 3) == 0 ->
107 | g = MAX(g, rec[i].f); // gather props
108 | done = done && (FIT(rec[i].f) == HI);
109 |
110 | :: s == rec[i].s && (s & 3) == 1 -> skip
111 |
112 | :: s == rec[i].s && (s & 3) >= 2 ->
113 | printf("%d step %d got <%d,%d> from %d\n", j, s, FIT(rec[i].m), VAL(rec[i].m), i);
114 | g = MAX(g, rec[i].m); // gather E/C
115 |
116 | :: s < rec[i].s -> // catch up proposer
117 | s = rec[i].s;
118 | p = rec[i].f;
119 | break;
120 | fi
121 | assert(s == rec[i].s);
122 |
123 | recs++; // this recorder has now "replied"
124 | mask = mask | (1 << i);
125 |
126 | select (i : 1 .. N); // choose next recorder
127 |
128 | } // atomic
129 |
130 | :: recs < T && (mask & (1 << i)) != 0 ->
131 | // we've already gotten a reply from this recorder,
132 | // so just pick a different one.
133 | select (i : 1 .. N);
134 |
135 | :: recs == T -> // we've heard from a threshold of recorders
136 |
137 | if
138 | :: (s & 3) == 0 -> // propose phase
139 | assert(FIT(g) > 0 && VAL(g) > 0);
140 | p = g; // pick best of some E set
141 |
142 | // Decide early if all proposals were HI fit
143 | if
144 | :: done ->
145 | DECIDE(j, s, p);
146 | :: else -> skip
147 | fi
148 |
149 | :: (s & 3) == 1 -> skip // spreadE phase
150 |
151 | :: (s & 3) == 2 -> // gatherEspreadC phase
152 | // p is now the best of a U set;
153 | // g is the best of all gathered E sets
154 | assert(FIT(g) > 0 && VAL(g) > 0);
155 | if
156 | :: p == g ->
157 | DECIDE(j, s, p);
158 | :: else -> skip
159 | fi
160 |
161 | :: (s & 3) == 3 -> // gatherC phase
162 | // g is the best of all gathered C sets.
163 | // this is our proposal for the next round.
164 | assert(FIT(g) > 0 && VAL(g) > 0);
165 | p = g;
166 | fi
167 | s = s + 1;
168 | break;
169 | od
170 |
171 | :: s > STEPHI -> // we've simulated enough time-steps
172 | break;
173 | od
174 | }
175 |
176 | init {
177 | assert(HI < 1 << FITBITS);
178 | assert(VALS < 1 << VALBITS);
179 |
180 | decided = 0; // we haven't decided yet
181 |
182 | // first choose the "well-known" leader, or 0 for no leader
183 | //leader = 0; // no leader
184 | leader = 1; // fixed leader
185 | //select (leader : 0 .. M); // any (or no) leader
186 |
187 | atomic {
188 | int i;
189 | for (i : 1 .. M) { // Launch M proposers
190 | run Proposer(i)
191 | }
192 | }
193 | }
194 |
195 |
--------------------------------------------------------------------------------
/spin/qpm.pml:
--------------------------------------------------------------------------------
1 | // Simple model of QuePaxa consensus.
2 | // Uses explicit message-based communication with recorders.
3 |
4 | #define N 3 // total number of recorder (state) nodes
5 | #define F 1 // number of failures tolerated
6 | #define T (N-F) // consensus threshold required
7 |
8 | #define M 2 // number of proposers (clients)
9 |
10 | #define STEPHI 11 // highest step number to simulate
11 | #define RAND 2 // random part of fitness space is 1..RAND
12 | #define HI (RAND+1) // top priority for proposals by leader
13 | #define VALS 2 // space of preferred values is 1..VALS
14 |
15 | // A proposal is an integer divided into two bit-fields: fitness and value.
16 | #define VALBITS 4
17 | #define FITBITS 4
18 | #define VALSHIFT (0)
19 | #define FITSHIFT (VALBITS)
20 | #define PROP(f,v) (((f) << FITSHIFT) | ((v) << VALSHIFT))
21 | #define VAL(p) (((p) >> VALSHIFT) & ((1 << VALBITS)-1))
22 | #define FIT(p) (((p) >> FITSHIFT) & ((1 << FITBITS)-1))
23 |
24 | #define MAX(a, b) ((a) > (b) -> (a) : (b))
25 |
26 | byte leader; // which proposer is the well-known leader
27 | byte decided; // proposed value that we've decided on
28 | byte propsdone; // number of proposers that have finished
29 |
30 | // Channels for recorder/proposer communication.
31 | chan creq[1+N] = [0] of { byte, byte, byte } // proposer j, step s, proposal p
32 | chan crep[1+M] = [0] of { byte, byte, byte, byte, byte} // recorder i, request step, recorder step, first, prior max
33 |
34 | #define DECIDE(j, s, p) atomic { \
35 | printf("%d step %d decided <%d,%d>", j, s, FIT(p), VAL(p)); \
36 | assert(decided == 0 || decided == VAL(p)); \
37 | decided = VAL(p); \
38 | }
39 |
40 | // Each proposer is a process.
41 | proctype Proposer(byte j) { // We're proposer j in 1..M
42 | byte s;
43 | byte p, g;
44 | byte ri, rs, rsn, rfn, rmn; // responses we get from recorders
45 | byte i, sent, recs; // request send and reply receiving state
46 | bit done; // for detecting early-decision opportunities
47 |
48 | // Choose the arbitrary initial "preferred value" of this proposer
49 | s = 4;
50 | select (p : 1 .. VALS); // select a "random" preferred value
51 | printf("%d proposing %d\n", j, p);
52 | p = PROP(HI, p);
53 |
54 | // Initialize per-step state for the first step of the first round.
55 | printf("%d step %d\n", j, s);
56 | sent = 0; // bit mask of recorders we've sent to
57 | recs = 0; // number of recorders we've heard from
58 | g = 0; // gather best response proposer saw so far
59 | done = true;
60 |
61 | i = 0; // first, send to a channel no one listens on
62 | do
63 | :: creq[i] ! j, s, p -> // send a request for this step to recorder i
64 | printf("%d step %d sent <%d,%d> to %d\n",
65 | j, s, FIT(p), VAL(p), i);
66 | sent = sent | (1 << i); // successfully sent
67 | i = 0; // now we have no target again
68 |
69 | :: s <= STEPHI && recs < T -> // choose a recorder to send to
70 |
71 | // randomize fitness in phase 0 if we're not the leader
72 | if
73 | :: (s & 3) == 0 && j != leader ->
74 | byte r;
75 | select(r : 1 .. RAND);
76 | p = PROP(r, VAL(p));
77 | :: else -> skip
78 | fi
79 | assert(FIT(p) > 0 && VAL(p) > 0);
80 |
81 | // choose a recorder that we haven't already sent a request to
82 | // revert to i=0 if we've already sent to selected recorder
83 | select (i : 1 .. N);
84 | i = ((sent & (1 << i)) == 0 -> i : 0);
85 |
86 | :: crep[j] ? ri, rs, rsn, rfn, rmn -> // get response from a recorder
87 | printf("%d step %d recv %d %d <%d,%d>,<%d,%d> from %d\n",
88 | j, s, rs, rsn, FIT(rfn), VAL(rfn),
89 | FIT(rmn), VAL(rmn), i);
90 | assert(rs <= s); // should get replies only to requests
91 | if
92 | :: rs < s -> skip // discard old unneeded replies
93 |
94 | :: rs == s && rsn > s -> // catch up to new recorder state
95 | s = rsn; // adopt recorder's round start state
96 | p = rfn;
97 |
98 | // initialize per-step state for the new time-step
99 | printf("%d step %d\n", j, s);
100 | sent = 0; // bit mask of recorders we've sent to
101 | recs = 0; // number of recorders we've heard from
102 | g = 0; // best response proposer saw so far
103 | done = true;
104 |
105 | :: rs == s && rsn == s && (s & 3) == 0 -> // propose phase
106 | g = MAX(g, rfn); // gather best of all first proposals
107 | done = done && (FIT(rfn) == HI);
108 | recs++; // this recorder has now replied
109 |
110 | :: rs == s && rsn == s && (s & 3) == 1 -> // spread E phase
111 | recs++; // this recorder has now replied
112 |
113 | :: rs == s && rsn == s && (s & 3) >= 2 -> // gather E spread C
114 | g = MAX(g, rmn); // gather best of E or C sets
115 | recs++; // this recorder has now replied
116 | fi
117 | assert(recs <= N); // shouldn't get any extra replies
118 |
119 | ri = 0; // clear temporaries
120 | rs = 0;
121 | rsn = 0;
122 | rfn = 0;
123 | rmn = 0;
124 |
125 | :: s <= STEPHI && recs >= T -> // got a quorum of replies
126 |
127 | // handle the proposer's completion of this round
128 | if
129 | :: (s & 3) == 0 -> // propose phase
130 | assert(FIT(g) > 0 && VAL(g) > 0);
131 | p = g; // pick best of some E set
132 |
133 | // Decide early if all proposals were HI fit
134 | if
135 | :: done ->
136 | DECIDE(j, s, p);
137 | :: else -> skip
138 | fi
139 |
140 | :: (s & 3) == 1 -> skip // spread E phase: nothing to do
141 |
142 | :: (s & 3) == 2 -> // gather E spread C phase
143 | // p is now the best of some universal (U) set;
144 | // g is the best of all the E sets we gathered.
145 | assert(FIT(g) > 0 && VAL(g) > 0);
146 | if
147 | :: p == g ->
148 | DECIDE(j, s, p);
149 | :: else -> skip
150 | fi
151 |
152 | :: (s & 3) == 3 -> // gather C phase
153 | // g is the best of all common (C) sets we gathered;
154 | // this becomes our proposal for the next round.
155 | assert(FIT(g) > 0 && VAL(g) > 0);
156 | p = g;
157 | fi
158 |
159 | // proceed to next logical time-step
160 | s = s + 1;
161 |
162 | // initialize per-step state for the new time-step
163 | printf("%d step %d\n", j, s);
164 | sent = 0; // bit mask of recorders we've sent to
165 | recs = 0; // number of recorders we've heard from
166 | g = 0; // best response proposer saw so far
167 | done = true;
168 |
169 | :: s > STEPHI -> // we've simulated enough time-steps
170 | break;
171 | od
172 |
173 | // count terminated proposers so recorders can terminate too
174 | atomic {
175 | propsdone++;
176 | }
177 | }
178 |
179 | // Each recorder is a process implementing an interval summary register (ISR).
180 | proctype Recorder(byte i) { // We're recorder i in 1..N
181 | byte s, f, a, m;
182 | byte rj, rs, rv;
183 |
184 | do
185 | :: creq[i] ? rj, rs, rv -> // got request from proposer rj
186 | if
187 | :: rs == s ->
188 | a = MAX(a, rv); // accumulate max of all values
189 |
190 | :: rs > s -> // forward to a later step
191 | m = (rs == s+1 -> a : 0);
192 | s = rs;
193 | f = rv;
194 | a = rv;
195 |
196 | :: else -> skip
197 | fi
198 |
199 | // send reply to the proposer --
200 | // but don't block forever if all proposers terminate.
201 | if
202 | :: crep[rj] ! i, rs, s, f, m // reply succeeded
203 | :: propsdone == M -> break // done while trying to send
204 | fi
205 |
206 | rj = 0; // clear temporaries
207 | rs = 0;
208 | rv = 0;
209 |
210 | :: propsdone == M -> // all proposers terminated?
211 | break; // terminate recorder thread
212 | od
213 | }
214 |
215 | // The initialization process just gets things launched.
216 | init {
217 | assert(HI < 1 << FITBITS);
218 | assert(VALS < 1 << VALBITS);
219 |
220 | decided = 0; // we haven't decided yet
221 |
222 | // first choose the "well-known" leader, or 0 for no leader
223 | //leader = 0; // no leader
224 | leader = 1; // fixed leader
225 | //select (leader : 0 .. M); // any (or no) leader
226 |
227 | atomic {
228 | int i, j;
229 |
230 | for (i : 1 .. N) { // Launch N recorders
231 | run Recorder(i)
232 | }
233 | for (j : 1 .. M) { // Launch M proposers
234 | run Proposer(j)
235 | }
236 | }
237 | }
238 |
239 |
--------------------------------------------------------------------------------
/spin/qsc.pml:
--------------------------------------------------------------------------------
1 |
2 | #define N 3 // total number of nodes
3 | #define Fa 1 // max number of availability failures
4 | #define Fc 0 // max number of correctness failures
5 | #define T (Fa+Fc+1) // consensus threshold required
6 |
7 | #define STEPS 3 // TLC time-steps per consensus round
8 | #define ROUNDS 2 // number of consensus rounds to run
9 | #define TICKETS 3 // proposal lottery ticket space
10 |
11 | // TLC state for each logical time-step
12 | typedef Step {
13 | bit sent; // true if we've sent our raw proposal
14 | bit seen[1+N]; // nodes whose raw proposals we've received
15 | bit ackd[1+N]; // nodes who have acknowledged our raw proposal
16 | bit witd; // true if our proposal is threshold witnessed
17 | bit witn[1+N]; // nodes we've gotten threshold witnessed msgs from
18 | }
19 |
20 | // QSC summary information for a "best" proposal seen so far
21 | typedef Best {
22 | byte from; // node number the proposal is from, 0 if tied spoiler
23 | byte tkt; // proposal's genetic fitness ticket value
24 | }
25 |
26 | // TLC and QSC state per round
27 | typedef Round {
28 | Step step[STEPS]; // TLC state for each logical time-step
29 |
30 | byte ticket; // QSC lottery ticket assigned to proposal at t+0
31 | Best spoil; // best potential spoiler(s) we've found so far
32 | Best conf; // best confirmed proposal we've seen so far
33 | Best reconf; // best reconfirmed proposal we've seen so far
34 | byte picked; // which proposal this node picked this round, 0 if not yet
35 | }
36 |
37 | // Per-node state
38 | typedef Node {
39 | Round rnd[ROUNDS]; // each node's per-consensus-round information
40 | }
41 |
42 | Node node[1+N]; // all state of each node 1..N
43 |
44 |
45 | // Implement a given node i.
46 | proctype NodeProc(byte i) {
47 | byte j, r, s, tkt, step, acks, wits;
48 |
49 | for (r : 0 .. ROUNDS-1) {
50 |
51 | atomic {
52 | // select a "random" (here just arbitrary) ticket
53 | select (tkt : 1 .. TICKETS);
54 | node[i].rnd[r].ticket = tkt;
55 |
56 | // start with our own proposal as best potential spoiler
57 | node[i].rnd[r].spoil.from = i;
58 | node[i].rnd[r].spoil.tkt = tkt;
59 | } // atomic
60 |
61 | // Run the round to completion
62 | for (s : 0 .. STEPS-1) {
63 |
64 | // "send" the broadcast for this time-step
65 | node[i].rnd[r].step[s].sent = 1;
66 |
67 | // collect a threshold of other nodes' broadcasts
68 | acks = 0;
69 | wits = 0;
70 | do
71 | :: // Pick another node to "receive" a message from
72 | select (j : 1 .. N);
73 | atomic {
74 |
75 | // Track the best potential spoiler we encounter
76 | if
77 | // Node j knows about a strictly better potential spoiler
78 | :: node[j].rnd[r].spoil.tkt > node[i].rnd[r].spoil.tkt ->
79 | node[i].rnd[r].spoil.from = node[j].rnd[r].spoil.from;
80 | node[i].rnd[r].spoil.tkt = node[j].rnd[r].spoil.tkt;
81 |
82 | // Node j knows about a spoiler that's tied with our best
83 | :: node[j].rnd[r].spoil.tkt == node[i].rnd[r].spoil.tkt &&
84 | node[j].rnd[r].spoil.from != node[i].rnd[r].spoil.from ->
85 | node[i].rnd[r].spoil.from = 0; // tied, so mark invalid
86 |
87 | :: else -> skip
88 | fi
89 |
90 | // Track the best confirmed proposal we encounter
91 | if
92 | :: node[j].rnd[r].conf.tkt > node[i].rnd[r].conf.tkt ->
93 | node[i].rnd[r].conf.from = node[j].rnd[r].conf.from;
94 | node[i].rnd[r].conf.tkt = node[j].rnd[r].conf.tkt;
95 | :: else -> skip
96 | fi
97 |
98 | // Track the best reconfirmed proposal we encounter
99 | if
100 | :: node[j].rnd[r].reconf.tkt > node[i].rnd[r].reconf.tkt ->
101 | node[i].rnd[r].reconf.from = node[j].rnd[r].reconf.from;
102 | node[i].rnd[r].reconf.tkt = node[j].rnd[r].reconf.tkt;
103 | :: else -> skip
104 | fi
105 |
106 | // Now handle specific types of messages: Raw, Ack, or Wit.
107 | if
108 |
109 | // We "receive" a raw unwitnessed message from node j
110 | :: node[j].rnd[r].step[s].sent && !node[i].rnd[r].step[s].seen[j] ->
111 |
112 | node[i].rnd[r].step[s].seen[j] = 1;
113 |
114 | // We "receive" an acknowledgment of our message from node j
115 | :: node[j].rnd[r].step[s].seen[i] && !node[i].rnd[r].step[s].ackd[j] ->
116 |
117 | node[i].rnd[r].step[s].ackd[j] = 1;
118 | acks++;
119 | if
120 | :: acks >= T ->
121 | // Our proposal is now fully threshold witnessed
122 | node[i].rnd[r].step[s].witd = 1;
123 |
124 | // See if our proposal is now the best confirmed proposal
125 | if
126 | :: s == 0 &&
127 | node[i].rnd[r].ticket > node[i].rnd[r].conf.tkt ->
128 | node[i].rnd[r].conf.from = i;
129 | node[i].rnd[r].conf.tkt = node[i].rnd[r].ticket;
130 | :: else -> skip
131 | fi
132 |
133 | // See if we're reconfirming a best confirmed proposal
134 | if
135 | :: s == 1 &&
136 | node[i].rnd[r].conf.tkt > node[i].rnd[r].reconf.tkt ->
137 | node[i].rnd[r].reconf.from = node[i].rnd[r].conf.from;
138 | node[i].rnd[r].reconf.tkt = node[i].rnd[r].conf.tkt;
139 | :: else -> skip
140 | fi
141 |
142 | :: else -> skip
143 | fi
144 |
145 | // We "receive" a fully threshold witnessed message from node j
146 | :: node[j].rnd[r].step[s].witd && !node[i].rnd[r].step[s].witn[j] ->
147 |
148 | node[i].rnd[r].step[s].witn[j] = 1;
149 | wits++;
150 |
151 | // End this step if we've seen enough witnessed proposals
152 | :: wits >= T -> break;
153 |
154 | :: else -> skip
155 | fi
156 | } // atomic
157 | od
158 | }
159 |
160 | atomic {
161 | printf("%d best spoiler %d ticket %d\n",
162 | i, node[i].rnd[r].spoil.from, node[i].rnd[r].spoil.tkt);
163 | printf("%d best confirmed %d ticket %d\n",
164 | i, node[i].rnd[r].conf.from, node[i].rnd[r].conf.tkt);
165 | printf("%d best reconfirmed %d ticket %d\n",
166 | i, node[i].rnd[r].reconf.from, node[i].rnd[r].reconf.tkt);
167 |
168 | // The round is now complete in terms of picking a proposal.
169 | node[i].rnd[r].picked = node[i].rnd[r].conf.from;
170 |
171 | // We can be sure everyone has converged on this proposal
172 | // if it is also the best spoiler and best reconfirmed proposal.
173 | if
174 | :: node[i].rnd[r].spoil.from == node[i].rnd[r].picked &&
175 | node[i].rnd[r].reconf.from == node[i].rnd[r].picked ->
176 | printf("%d round %d definitely COMMITTED\n", i, r);
177 |
178 | // Verify that what we decided doesn't conflict with
179 | // the proposal any other node chooses.
180 | select (j : 1 .. N);
181 | assert(!node[j].rnd[r].picked ||
182 | (node[j].rnd[r].picked == node[i].rnd[r].picked));
183 |
184 | :: node[i].rnd[r].reconf.from != node[i].rnd[r].picked ->
185 | printf("%d round %d FAILED to be reconfirmed\n", i, r);
186 |
187 | :: node[i].rnd[r].spoil.from != node[i].rnd[r].picked ->
188 | printf("%d round %d FAILED due to spoiler\n", i, r);
189 |
190 | :: node[i].rnd[r].spoil.from == 0 ->
191 | printf("%d round %d FAILED due to tie\n", i, r);
192 |
193 | :: else -> skip
194 | fi
195 | } // atomic
196 | }
197 | }
198 |
199 | init {
200 | atomic {
201 | int i;
202 | for (i : 1 .. N) {
203 | run NodeProc(i)
204 | }
205 | }
206 | }
207 |
208 |
--------------------------------------------------------------------------------
/spin/results-qp.txt:
--------------------------------------------------------------------------------
1 | qp.pml verification:
2 |
3 | Exhaustive verification using spin -search -O2 -safety -DMEMLIM=60000 $1
4 | Results from running on Bryan's 2019 MacBook Pro M1 Max.
5 |
6 | ---
7 | 7 steps (1 full consensus round in steps 4-7):
8 |
9 | Depth= 180 States= 1.4e+07 Transitions= 2.51e+07 Memory= 1088.105 t= 5.64 R= 2e+06
10 |
11 | (Spin Version 6.5.2 -- 6 December 2019)
12 | + Partial Order Reduction
13 |
14 | Full statespace search for:
15 | never claim - (none specified)
16 | assertion violations +
17 | cycle checks - (disabled by -DSAFETY)
18 | invalid end states +
19 |
20 | State-vector 60 byte, depth reached 180, errors: 0
21 | 14561376 states, stored
22 | 11554544 states, matched
23 | 26115920 transitions (= stored+matched)
24 | 14444193 atomic steps
25 | hash conflicts: 3406143 (resolved)
26 |
27 | Stats on memory usage (in Megabytes):
28 | 1222.039 equivalent memory usage for states (stored*(State-vector + overhead))
29 | 998.371 actual memory usage for states (compression: 81.70%)
30 | state-vector as stored = 44 byte + 28 byte overhead
31 | 128.000 memory used for hash table (-w24)
32 | 0.534 memory used for DFS stack (-m10000)
33 | 1126.581 total actual memory usage
34 |
35 |
36 | unreached in proctype Proposer
37 | (0 of 114 states)
38 | unreached in init
39 | (0 of 16 states)
40 |
41 | pan: elapsed time 5.92 seconds
42 | pan: rate 2459691.9 states/second
43 |
44 |
45 | ---
46 | 8 steps:
47 |
48 | Depth= 220 States= 4.3e+07 Transitions= 7.36e+07 Memory= 3559.675 t= 22.4 R= 2e+06
49 |
50 | (Spin Version 6.5.2 -- 6 December 2019)
51 | + Partial Order Reduction
52 |
53 | Full statespace search for:
54 | never claim - (none specified)
55 | assertion violations +
56 | cycle checks - (disabled by -DSAFETY)
57 | invalid end states +
58 |
59 | State-vector 60 byte, depth reached 220, errors: 0
60 | 43443684 states, stored
61 | 31011944 states, matched
62 | 74455628 transitions (= stored+matched)
63 | 38765865 atomic steps
64 | hash conflicts: 24394276 (resolved)
65 |
66 | Stats on memory usage (in Megabytes):
67 | 3645.939 equivalent memory usage for states (stored*(State-vector + overhead))
68 | 3078.502 actual memory usage for states (compression: 84.44%)
69 | state-vector as stored = 46 byte + 28 byte overhead
70 | 512.000 memory used for hash table (-w26)
71 | 0.534 memory used for DFS stack (-m10000)
72 | 3590.144 total actual memory usage
73 |
74 |
75 | unreached in proctype Proposer
76 | (0 of 114 states)
77 | unreached in init
78 | (0 of 16 states)
79 |
80 | pan: elapsed time 22.6 seconds
81 | pan: rate 1919738.6 states/second
82 |
83 |
84 | ---
85 | 9 steps:
86 |
87 | Depth= 262 States= 1.17e+08 Transitions= 2.12e+08 Memory= 8642.683 t= 67.9 R= 2e+06
88 |
89 | (Spin Version 6.5.2 -- 6 December 2019)
90 | + Partial Order Reduction
91 |
92 | Full statespace search for:
93 | never claim - (none specified)
94 | assertion violations +
95 | cycle checks - (disabled by -DSAFETY)
96 | invalid end states +
97 |
98 | State-vector 60 byte, depth reached 262, errors: 0
99 | 1.1701493e+08 states, stored
100 | 95266996 states, matched
101 | 2.1228193e+08 transitions (= stored+matched)
102 | 1.3433094e+08 atomic steps
103 | hash conflicts: 80036981 (resolved)
104 |
105 | Stats on memory usage (in Megabytes):
106 | 9820.284 equivalent memory usage for states (stored*(State-vector + overhead))
107 | 8133.282 actual memory usage for states (compression: 82.82%)
108 | state-vector as stored = 45 byte + 28 byte overhead
109 | 512.000 memory used for hash table (-w26)
110 | 0.534 memory used for DFS stack (-m10000)
111 | 2.157 memory lost to fragmentation
112 | 8643.659 total actual memory usage
113 |
114 |
115 | unreached in proctype Proposer
116 | (0 of 114 states)
117 | unreached in init
118 | (0 of 16 states)
119 |
120 | pan: elapsed time 67.9 seconds
121 | pan: rate 1722834.7 states/second
122 |
123 |
124 | ---
125 | 10 steps:
126 |
127 | Depth= 302 States= 1.93e+08 Transitions= 3.66e+08 Memory= 15855.624 t= 121 R= 2e+06
128 |
129 | (Spin Version 6.5.2 -- 6 December 2019)
130 | + Partial Order Reduction
131 |
132 | Full statespace search for:
133 | never claim - (none specified)
134 | assertion violations +
135 | cycle checks - (disabled by -DSAFETY)
136 | invalid end states +
137 |
138 | State-vector 60 byte, depth reached 302, errors: 0
139 | 1.9397366e+08 states, stored
140 | 1.7384229e+08 states, matched
141 | 3.6781595e+08 transitions (= stored+matched)
142 | 2.374664e+08 atomic steps
143 | hash conflicts: 1.350289e+08 (resolved)
144 |
145 | Stats on memory usage (in Megabytes):
146 | 16278.918 equivalent memory usage for states (stored*(State-vector + overhead))
147 | 13877.107 actual memory usage for states (compression: 85.25%)
148 | state-vector as stored = 47 byte + 28 byte overhead
149 | 2048.000 memory used for hash table (-w28)
150 | 0.534 memory used for DFS stack (-m10000)
151 | 3.122 memory lost to fragmentation
152 | 15922.519 total actual memory usage
153 |
154 |
155 | unreached in proctype Proposer
156 | (0 of 114 states)
157 | unreached in init
158 | (0 of 16 states)
159 |
160 | pan: elapsed time 122 seconds
161 | pan: rate 1595047 states/second
162 |
163 |
164 | ---
165 | 11 steps (2 full consensus rounds: steps 4-7 and 8-11):
166 |
167 | Depth= 338 States= 2.45e+08 Transitions= 4.68e+08 Memory= 19425.351 t= 153 R= 2e+06
168 |
169 | (Spin Version 6.5.2 -- 6 December 2019)
170 | + Partial Order Reduction
171 |
172 | Full statespace search for:
173 | never claim - (none specified)
174 | assertion violations +
175 | cycle checks - (disabled by -DSAFETY)
176 | invalid end states +
177 |
178 | State-vector 60 byte, depth reached 338, errors: 0
179 | 2.4529035e+08 states, stored
180 | 2.2295857e+08 states, matched
181 | 4.6824892e+08 transitions (= stored+matched)
182 | 3.0213691e+08 atomic steps
183 | hash conflicts: 1.5778641e+08 (resolved)
184 |
185 | Stats on memory usage (in Megabytes):
186 | 20585.585 equivalent memory usage for states (stored*(State-vector + overhead))
187 | 17400.834 actual memory usage for states (compression: 84.53%)
188 | state-vector as stored = 46 byte + 28 byte overhead
189 | 2048.000 memory used for hash table (-w28)
190 | 0.534 memory used for DFS stack (-m10000)
191 | 4.096 memory lost to fragmentation
192 | 19445.272 total actual memory usage
193 |
194 |
195 | unreached in proctype Proposer
196 | (0 of 114 states)
197 | unreached in init
198 | (0 of 16 states)
199 |
200 | pan: elapsed time 153 seconds
201 | pan: rate 1600380.7 states/second
202 |
203 |
204 | ---
205 | 12 steps:
206 |
207 | Depth= 378 States= 3.85e+08 Transitions= 7.09e+08 Memory= 28987.069 t= 244 R= 2e+06
208 |
209 | (Spin Version 6.5.2 -- 6 December 2019)
210 | + Partial Order Reduction
211 |
212 | Full statespace search for:
213 | never claim - (none specified)
214 | assertion violations +
215 | cycle checks - (disabled by -DSAFETY)
216 | invalid end states +
217 |
218 | State-vector 60 byte, depth reached 378, errors: 0
219 | 3.8578596e+08 states, stored
220 | 3.2452935e+08 states, matched
221 | 7.1031531e+08 transitions (= stored+matched)
222 | 4.3387088e+08 atomic steps
223 | hash conflicts: 2.5898853e+08 (resolved)
224 |
225 | Stats on memory usage (in Megabytes):
226 | 32376.446 equivalent memory usage for states (stored*(State-vector + overhead))
227 | 26998.934 actual memory usage for states (compression: 83.39%)
228 | state-vector as stored = 45 byte + 28 byte overhead
229 | 2048.000 memory used for hash table (-w28)
230 | 0.534 memory used for DFS stack (-m10000)
231 | 6.396 memory lost to fragmentation
232 | 29041.073 total actual memory usage
233 |
234 |
235 | unreached in proctype Proposer
236 | (0 of 114 states)
237 | unreached in init
238 | (0 of 16 states)
239 |
240 | pan: elapsed time 244 seconds
241 | pan: rate 1579665.7 states/second
242 |
243 |
--------------------------------------------------------------------------------
/spin/results-qpm.txt:
--------------------------------------------------------------------------------
1 | qpm.pml verification:
2 |
3 | Bitstate verification using spin -search -O2 -safety -bitstate -w38 $1
4 | (32GB state hash table).
5 | Results from running on Bryan's 2019 MacBook Pro M1 Max.
6 |
7 | ---
8 | 4 steps:
9 |
10 | Depth= 1007 States= 3e+06 Transitions= 4.93e+06 Memory= 32768.925 t= 3.14 R= 1e+06
11 |
12 | (Spin Version 6.5.2 -- 6 December 2019)
13 | + Partial Order Reduction
14 |
15 | Bit statespace search for:
16 | never claim - (none specified)
17 | assertion violations +
18 | cycle checks - (disabled by -DSAFETY)
19 | invalid end states +
20 |
21 | State-vector 168 byte, depth reached 1007, errors: 0
22 | 3871180 states, stored
23 | 2503227 states, matched
24 | 6374407 transitions (= stored+matched)
25 | 20 atomic steps
26 |
27 | hash factor: 71006.2 (best if > 100.)
28 |
29 | bits set per state: 3 (-k3)
30 |
31 | Stats on memory usage (in Megabytes):
32 | 694.067 equivalent memory usage for states (stored*(State-vector + overhead))
33 | 32768.000 memory used for hash array (-w38)
34 | 0.076 memory used for bit stack
35 | 0.534 memory used for DFS stack (-m10000)
36 | 32768.925 total actual memory usage
37 |
38 |
39 | unreached in proctype Proposer
40 | qpm.pml:115, state 58, "recs = (recs+1)"
41 | qpm.pml:140, state 81, "(1)"
42 | qpm.pml:148, state 87, "decided = ((p>>0)&((1<<4)-1))"
43 | qpm.pml:149, state 90, "(1)"
44 | qpm.pml:147, state 91, "((p==g))"
45 | qpm.pml:147, state 91, "else"
46 | qpm.pml:156, state 95, "p = g"
47 | (6 of 111 states)
48 | unreached in proctype Recorder
49 | (0 of 26 states)
50 | unreached in init
51 | (0 of 26 states)
52 |
53 | pan: elapsed time 3.96 seconds
54 | pan: rate 977570.71 states/second
55 |
56 |
57 | ---
58 | 5 steps:
59 |
60 | Depth= 1743 States= 2.13e+08 Transitions= 3.74e+08 Memory= 32769.120 t= 213 R= 1e+06
61 |
62 | (Spin Version 6.5.2 -- 6 December 2019)
63 | + Partial Order Reduction
64 |
65 | Bit statespace search for:
66 | never claim - (none specified)
67 | assertion violations +
68 | cycle checks - (disabled by -DSAFETY)
69 | invalid end states +
70 |
71 | State-vector 168 byte, depth reached 1743, errors: 0
72 | 2.1362823e+08 states, stored
73 | 1.6166948e+08 states, matched
74 | 3.7529772e+08 transitions (= stored+matched)
75 | 20 atomic steps
76 |
77 | hash factor: 1286.71 (best if > 100.)
78 |
79 | bits set per state: 3 (-k3)
80 |
81 | Stats on memory usage (in Megabytes):
82 | 38301.571 equivalent memory usage for states (stored*(State-vector + overhead))
83 | 32768.000 memory used for hash array (-w38)
84 | 0.076 memory used for bit stack
85 | 0.534 memory used for DFS stack (-m10000)
86 | 32769.120 total actual memory usage
87 |
88 |
89 | unreached in proctype Proposer
90 | qpm.pml:148, state 87, "decided = ((p>>0)&((1<<4)-1))"
91 | qpm.pml:149, state 90, "(1)"
92 | qpm.pml:147, state 91, "((p==g))"
93 | qpm.pml:147, state 91, "else"
94 | qpm.pml:156, state 95, "p = g"
95 | (4 of 111 states)
96 | unreached in proctype Recorder
97 | (0 of 26 states)
98 | unreached in init
99 | (0 of 26 states)
100 |
101 | pan: elapsed time 213 seconds
102 | pan: rate 1001116.4 states/second
103 |
104 | ---
105 | 6 steps:
106 |
107 | Depth= 2323 States= 1.19e+09 Transitions= 2.14e+09 Memory= 32769.218 t= 1.11e+03 R= 1e+06
108 |
109 | (Spin Version 6.5.2 -- 6 December 2019)
110 | + Partial Order Reduction
111 |
112 | Bit statespace search for:
113 | never claim - (none specified)
114 | assertion violations +
115 | cycle checks - (disabled by -DSAFETY)
116 | invalid end states +
117 |
118 | State-vector 168 byte, depth reached 2323, errors: 0
119 | 1.1925986e+09 states, stored
120 | 9.5240049e+08 states, matched
121 | 2.1449991e+09 transitions (= stored+matched)
122 | 20 atomic steps
123 |
124 | hash factor: 230.487 (best if > 100.)
125 |
126 | bits set per state: 3 (-k3)
127 |
128 | Stats on memory usage (in Megabytes):
129 | 213821.928 equivalent memory usage for states (stored*(State-vector + overhead))
130 | 32768.000 memory used for hash array (-w38)
131 | 0.076 memory used for bit stack
132 | 0.534 memory used for DFS stack (-m10000)
133 | 32769.218 total actual memory usage
134 |
135 |
136 | unreached in proctype Proposer
137 | qpm.pml:156, state 95, "p = g"
138 | (1 of 111 states)
139 | unreached in proctype Recorder
140 | (0 of 26 states)
141 | unreached in init
142 | (0 of 26 states)
143 |
144 | pan: elapsed time 1.11e+03 seconds
145 | pan: rate 1070699.5 states/second
146 |
147 | ---
148 | 7 steps:
149 |
150 | Depth= 3018 States= 3.57e+09 Transitions= 6.48e+09 Memory= 32769.315 t= 3.44e+03 R= 1e+06
151 |
152 | (Spin Version 6.5.2 -- 6 December 2019)
153 | + Partial Order Reduction
154 |
155 | Bit statespace search for:
156 | never claim - (none specified)
157 | assertion violations +
158 | cycle checks - (disabled by -DSAFETY)
159 | invalid end states +
160 |
161 | State-vector 168 byte, depth reached 3018, errors: 0
162 | 3.5701183e+09 states, stored
163 | 2.9060201e+09 states, matched
164 | 6.4761385e+09 transitions (= stored+matched)
165 | 20 atomic steps
166 |
167 | hash factor: 76.9941 (best if > 100.)
168 |
169 | bits set per state: 3 (-k3)
170 |
171 | Stats on memory usage (in Megabytes):
172 | 640089.267 equivalent memory usage for states (stored*(State-vector + overhead))
173 | 32768.000 memory used for hash array (-w38)
174 | 0.076 memory used for bit stack
175 | 0.534 memory used for DFS stack (-m10000)
176 | 32769.315 total actual memory usage
177 |
178 |
179 | unreached in proctype Proposer
180 | (0 of 111 states)
181 | unreached in proctype Recorder
182 | (0 of 26 states)
183 | unreached in init
184 | (0 of 26 states)
185 |
186 | pan: elapsed time 3.44e+03 seconds
187 | pan: rate 1037038.3 states/second
188 |
189 |
190 | ---
191 | 8 steps:
192 |
193 | Depth= 3741 States= 1.55e+10 Transitions= 2.7e+10 Memory= 32769.511 t= 1.6e+04 R= 1e+06
194 |
195 | (Spin Version 6.5.2 -- 6 December 2019)
196 | + Partial Order Reduction
197 |
198 | Bit statespace search for:
199 | never claim - (none specified)
200 | assertion violations +
201 | cycle checks - (disabled by -DSAFETY)
202 | invalid end states +
203 |
204 | State-vector 168 byte, depth reached 3741, errors: 0
205 | 1.5529605e+10 states, stored
206 | 1.1502249e+10 states, matched
207 | 2.7031855e+10 transitions (= stored+matched)
208 | 20 atomic steps
209 |
210 | hash factor: 17.7003 (best if > 100.)
211 |
212 | bits set per state: 3 (-k3)
213 |
214 | Stats on memory usage (in Megabytes):
215 | 2784314.887 equivalent memory usage for states (stored*(State-vector + overhead))
216 | 32768.000 memory used for hash array (-w38)
217 | 0.076 memory used for bit stack
218 | 0.534 memory used for DFS stack (-m10000)
219 | 32769.511 total actual memory usage
220 |
221 |
222 | unreached in proctype Proposer
223 | (0 of 111 states)
224 | unreached in proctype Recorder
225 | (0 of 26 states)
226 | unreached in init
227 | (0 of 26 states)
228 |
229 | pan: elapsed time 1.6e+04 seconds
230 | pan: rate 969476.33 states/second
231 |
232 | ---
233 | 10 steps:
234 |
235 | Depth= 4912 States= 6.2e+10 Transitions= 1.1e+11 Memory= 32769.706 t= 1.06e+05 R= 6e+05
236 |
237 | (Spin Version 6.5.2 -- 6 December 2019)
238 | + Partial Order Reduction
239 |
240 | Bit statespace search for:
241 | never claim - (none specified)
242 | assertion violations +
243 | cycle checks - (disabled by -DSAFETY)
244 | invalid end states +
245 |
246 | State-vector 168 byte, depth reached 4912, errors: 0
247 | 6.1979144e+10 states, stored
248 | 4.7682823e+10 states, matched
249 | 1.0966197e+11 transitions (= stored+matched)
250 | 20 atomic steps
251 |
252 | hash factor: 4.43501 (best if > 100.)
253 |
254 | bits set per state: 3 (-k3)
255 |
256 | Stats on memory usage (in Megabytes):
257 | 11112288.506 equivalent memory usage for states (stored*(State-vector + overhead))
258 | 32768.000 memory used for hash array (-w38)
259 | 0.076 memory used for bit stack
260 | 0.534 memory used for DFS stack (-m10000)
261 | 1.014 other (proc and chan stacks)
262 | 32769.706 total actual memory usage
263 |
264 |
265 | unreached in proctype Proposer
266 | (0 of 111 states)
267 | unreached in proctype Recorder
268 | (0 of 26 states)
269 | unreached in init
270 | (0 of 26 states)
271 |
272 | pan: elapsed time 1.06e+05 seconds
273 | pan: rate 583824.35 states/second
274 |
275 |
276 | ---
277 | 11 steps:
278 |
279 |
280 | Depth= 5465 States= 7.1e+10 Transitions= 1.25e+11 Memory= 32769.804 t= 8.45e+04 R= 8e+05
281 |
282 | (Spin Version 6.5.2 -- 6 December 2019)
283 | + Partial Order Reduction
284 |
285 | Bit statespace search for:
286 | never claim - (none specified)
287 | assertion violations +
288 | cycle checks - (disabled by -DSAFETY)
289 | invalid end states +
290 |
291 | State-vector 168 byte, depth reached 5465, errors: 0
292 | 7.0950964e+10 states, stored
293 | 5.4281076e+10 states, matched
294 | 1.2523204e+11 transitions (= stored+matched)
295 | 20 atomic steps
296 |
297 | hash factor: 3.8742 (best if > 100.)
298 |
299 | bits set per state: 3 (-k3)
300 |
301 | Stats on memory usage (in Megabytes):
302 | 12720853.000 equivalent memory usage for states (stored*(State-vector + overhead))
303 | 32768.000 memory used for hash array (-w38)
304 | 0.076 memory used for bit stack
305 | 0.534 memory used for DFS stack (-m10000)
306 | 1.111 other (proc and chan stacks)
307 | 32769.804 total actual memory usage
308 |
309 |
310 | unreached in proctype Proposer
311 | (0 of 111 states)
312 | unreached in proctype Recorder
313 | (0 of 26 states)
314 | unreached in init
315 | (0 of 26 states)
316 |
317 | pan: elapsed time 8.45e+04 seconds
318 | pan: rate 839344.87 states/second
319 |
320 |
321 | ---
322 | 12 steps:
323 |
324 |
325 |
326 |
--------------------------------------------------------------------------------
/spin/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Analyze the consensus model using the Spin model checker.
3 |
4 | # Exhaustive verification.
5 | # MEMLIM is the memory-usage limit in megabytes.
6 | #spin -search -O2 -safety -DMEMLIM=60000 $1
7 |
8 | # Set maximum search depth (-m), making it an error to exceed this depth (-b).
9 | #spin -search -O2 -safety -DMEMLIM=60000 -m3870 -b $1
10 |
11 | # Exhaustive verification with state vector compression.
12 | #spin -search -O2 -safety -DMEMLIM=60000 -collapse $1
13 | #spin -search -O2 -safety -DMEMLIM=60000 -hc $1
14 |
15 | # Bitstate verification - most aggressive state compression.
16 | # -w defines the power of two of the hash table size in bits.
17 | # examples: -w28: 32MB, -w33: 1GB, -w38: 32GB
18 | #spin -search -O2 -safety -bitstate -w28 $1
19 | spin -search -O2 -safety -bitstate -w38 $1
20 |
21 |
--------------------------------------------------------------------------------
/tools/qsc/.gitignore:
--------------------------------------------------------------------------------
1 | qsc
2 |
--------------------------------------------------------------------------------
/tools/qsc/group.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "net/url"
7 | "strings"
8 |
9 | "github.com/bford/cofo/cri"
10 |
11 | "github.com/dedis/tlc/go/lib/cas"
12 | "github.com/dedis/tlc/go/lib/fs/casdir"
13 | "github.com/dedis/tlc/go/model/qscod/qscas"
14 | )
15 |
16 | // Group represents a QSC consensus group.
17 | // XXX move to a suitable generic package.
18 | type group struct {
19 | qscas.Group
20 | }
21 |
22 | // Open a consensus group identified by the resource identifier ri.
23 | // Creates the group if create is true; otherwise opens existing group state.
24 | //
25 | // Supports composable resource identifiers (CRIs) as the preferred group syntax
26 | // because CRIs cleanly support nesting of resource identifiers.
27 | //
28 | func (g *group) Open(ctx context.Context, ri string, create bool) error {
29 |
30 | // Parse the group resource identifier into individual members
31 | paths, err := parseGroupRI(ri)
32 | if err != nil {
33 | return err
34 | }
35 | n := len(paths) // number of members in the consensus group
36 |
37 | // Create a POSIX directory-based CAS interface to each store
38 | stores := make([]cas.Store, n)
39 | for i, path := range paths {
40 | st := &casdir.Store{}
41 | if err := st.Init(path, create, create); err != nil {
42 | return err
43 | }
44 | stores[i] = st
45 | }
46 |
47 | // Start a CAS-based consensus group across this set of stores,
48 | // with the default threshold configuration.
49 | // (XXX make this configurable eventually.)
50 | g.Group.Start(ctx, stores, -1)
51 |
52 | return nil
53 | }
54 |
55 | // Parse a group resource identifier into individual member identifiers.
56 | func parseGroupRI(group string) ([]string, error) {
57 |
58 | // Allow just '[...]' as a command-line shorthand for 'qsc[...]'
59 | if len(group) > 0 && group[0] == '[' {
60 | group = "qsc" + group
61 | }
62 |
63 | // Parsing it as an actual CRI/URI is kind of unnecessary so far,
64 | // but may get more interesting with query-string options and such.
65 | rawurl, err := cri.URI.From(group)
66 | if err != nil {
67 | return nil, err
68 | }
69 | //println("rawurl:", rawurl)
70 | url, err := url.Parse(rawurl)
71 | if err != nil {
72 | return nil, err
73 | }
74 | if url.Scheme != "qsc" {
75 | return nil, errors.New("consensus groups must use qsc scheme")
76 | }
77 |
78 | // Parse the nested member paths from the opaque string in the URL.
79 | str, path := url.Opaque, ""
80 | var paths []string
81 | for str != "" {
82 | if i := strings.IndexByte(str, ','); i >= 0 {
83 | path, str = str[:i], str[i+1:]
84 | } else {
85 | path, str = str, ""
86 | }
87 | paths = append(paths, path)
88 | }
89 | if len(paths) < 3 {
90 | return nil, errors.New(
91 | "consensus groups must have minimum three members")
92 | }
93 |
94 | return paths, nil
95 | }
96 |
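
For illustration, the full form and the bare-bracket shorthand accepted by Open look as follows; this is a minimal sketch, not part of the repository, and the member paths are hypothetical local directories so that the casdir stores can be created.

// Illustrative only: create a three-member consensus group backed by
// hypothetical local directories ("[...]" alone is shorthand for "qsc[...]").
func exampleGroup(ctx context.Context) error {
	var g group
	return g.Open(ctx, "qsc[/tmp/m1,/tmp/m2,/tmp/m3]", true)
}
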
--------------------------------------------------------------------------------
/tools/qsc/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | //"flag"
6 | //"log"
7 | "context"
8 | "os"
9 | )
10 |
11 | var verbose bool = false
12 |
13 | const usageStr = `
14 | The qsc command provides tools using Que Sera Consensus (QSC).
15 |
16 | Usage:
17 |
18 | qsc <type> <command> [arguments]
19 |
20 | The types of consensus groups are:
21 |
22 | string Consensus on simple strings
23 | git Consensus on Git repositories
24 | hg Consensus on Mercurial repositories
25 |
26 | Run qsc <type> help for commands that apply to each type.
27 | `
28 |
29 | func usage(usageString string) {
30 | fmt.Println(usageString)
31 | os.Exit(1)
32 | }
33 |
34 | func main() {
35 | if len(os.Args) < 2 {
36 | usage(usageStr)
37 | }
38 |
39 | // Create a cancelable top-level context and cancel it when we're done,
40 | // to shut down asynchronous consensus access operations cleanly.
41 | ctx, cancel := context.WithCancel(context.Background())
42 | defer cancel()
43 |
44 | // Parse consensus group kind
45 | switch os.Args[1] {
46 | case "string":
47 | stringCommand(ctx, os.Args[2:])
48 | default:
49 | usage(usageStr)
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/tools/qsc/string.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 | "os"
8 | )
9 |
10 | func stringCommand(ctx context.Context, args []string) {
11 | if len(args) == 0 {
12 | usage(stringUsageStr)
13 | }
14 | switch args[0] {
15 | case "init":
16 | stringInitCommand(ctx, args[1:])
17 | case "get":
18 | stringGetCommand(ctx, args[1:])
19 | case "set":
20 | stringSetCommand(ctx, args[1:])
21 | default:
22 | usage(stringUsageStr)
23 | }
24 | }
25 |
26 | const stringUsageStr = `
27 | Usage: qsc string <command> [arguments]
28 |
29 | The commands for string-value consensus groups are:
30 |
31 | init initialize a new consensus group
32 | get output the current consensus state as a quoted string
33 | set change the consensus state via atomic compare-and-set
34 | `
35 |
36 | func stringInitCommand(ctx context.Context, args []string) {
37 | if len(args) != 1 {
38 | usage(stringInitUsageStr)
39 | }
40 |
41 | // Create the consensus group state on each member node
42 | var g group
43 | err := g.Open(ctx, args[0], true)
44 | if err != nil {
45 | log.Fatal(err)
46 | }
47 | }
48 |
49 | const stringInitUsageStr = `
50 | Usage: qsc string init <group>
51 |
52 | where <group> specifies the consensus group
53 | as a composable resource identifier (CRI).
54 | For example:
55 |
56 | qsc string init qsc[host1:path1,host2:path2,host3:path3]
57 | `
58 |
59 | func stringGetCommand(ctx context.Context, args []string) {
60 | if len(args) != 1 {
61 | usage(stringGetUsageStr)
62 | }
63 |
64 | // Open the file stores
65 | var g group
66 | err := g.Open(ctx, args[0], false)
67 | if err != nil {
68 | log.Fatal(err)
69 | }
70 |
71 | // Find a consensus view of the last known commit.
72 | ver, val, err := g.CompareAndSet(ctx, "", "")
73 | if err != nil {
74 | log.Fatal(err)
75 | }
76 |
77 | fmt.Printf("version %d state %q\n", ver, val)
78 | }
79 |
80 | const stringGetUsageStr = `
81 | Usage: qsc string get <group>
82 |
83 | where <group> specifies the consensus group.
84 | Reads and prints the version number and string last committed.
85 | `
86 |
87 | func stringSetCommand(ctx context.Context, args []string) {
88 | if len(args) != 3 {
89 | usage(stringSetUsageStr)
90 | }
91 |
92 | old := args[1]
93 | new := args[2]
94 | if new == "" {
95 | log.Fatal("The empty string is reserved for the starting state")
96 | }
97 |
98 | // Open the file stores
99 | var g group
100 | err := g.Open(ctx, args[0], false)
101 | if err != nil {
102 | log.Fatal(err)
103 | }
104 |
105 | // Invoke the requested compare-and-set operation.
106 | ver, val, err := g.CompareAndSet(ctx, old, new)
107 | if err != nil {
108 | log.Fatal(err)
109 | }
110 |
111 | fmt.Printf("version %d state %q\n", ver, val)
112 |
113 | // Return success only if the next commit was what we wanted
114 | if val != new {
115 | os.Exit(1)
116 | }
117 | os.Exit(0)
118 | }
119 |
120 | const stringSetUsageStr = `
121 | Usage: qsc string set <group> <old> <new>
122 |
123 | where:
124 | <group> specifies the consensus group
125 | <old> is the expected existing value string
126 | <new> is the new value to set if it hasn't yet changed from <old>
127 |
128 | Prints the version number and string last committed,
129 | regardless of success or failure.
130 | `
131 |
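
The get and set commands above compose into the usual optimistic-concurrency pattern: read the current consensus value, compute a successor, and retry the compare-and-set until our value is the one committed. The following is a minimal sketch of that loop, assuming only the Group.CompareAndSet call already used above; the update function and the helper itself are illustrative, not part of the repository.

// Illustrative only: repeat CompareAndSet until our update is what gets committed.
func updateUntilCommitted(ctx context.Context, g *group, update func(string) string) (string, error) {
	_, cur, err := g.CompareAndSet(ctx, "", "") // consensus view of the current value
	for err == nil {
		next := update(cur)
		var val string
		if _, val, err = g.CompareAndSet(ctx, cur, next); err == nil && val == next {
			return val, nil // our proposal won this version
		}
		cur = val // another proposal was committed first; retry from it
	}
	return "", err
}
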
--------------------------------------------------------------------------------