├── .gitignore
├── .travis.yml
├── .vscode
    ├── bookmarks.json
    └── settings.json
├── LICENSE
├── Mutual-Exclusion
    ├── README.md
    ├── code
    │   ├── clock.go
    │   ├── clock_test.go
    │   ├── message.go
    │   ├── message_test.go
    │   ├── process.go
    │   ├── process_test.go
    │   ├── receivedTime.go
    │   ├── receivedTime_test.go
    │   ├── requestQueue.go
    │   ├── requestQueue_test.go
    │   ├── resource.go
    │   ├── resource_test.go
    │   ├── timestamp.go
    │   ├── timestamp_test.go
    │   ├── util.go
    │   └── util_test.go
    ├── mutual_exclusion_request_period.pptx
    ├── qna.md
    ├── request_period.png
    ├── spanner-osdi2012.pdf
    └── time-clocks.pdf
├── PoW.zip
├── README.md
├── Raft
    ├── 6.824-2018.zip
    ├── 6.824Lab2_Raft.html
    ├── 6.824Lab2_Raft_files
    │   └── style.css
    ├── README.md
    ├── code
    │   ├── config.go
    │   ├── labgob
    │   │   ├── labgob.go
    │   │   └── test_test.go
    │   ├── labrpc
    │   │   ├── labrpc.go
    │   │   └── test_test.go
    │   ├── persister.go
    │   ├── raft-API.go
    │   ├── raft-AppendEntries.go
    │   ├── raft-LogEntry.go
    │   ├── raft-Raft.go
    │   ├── raft-RequestVote.go
    │   ├── raft-method.go
    │   ├── raft-persist.go
    │   ├── raft-settings.go
    │   ├── raft-settings_test.go
    │   ├── raft-state.go
    │   ├── raft-state_test.go
    │   ├── test_test.go
    │   └── util.go
    └── raft-extended.pdf
└── test.sh
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.dll
 4 | *.so
 5 | *.dylib
 6 | 
 7 | # Test binary, build with `go test -c`
 8 | *.test
 9 | 
10 | # Output of the go coverage tool, specifically when used with LiteIDE
11 | *.out
12 | 
13 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
14 | .glide/
15 | 
16 | output.*.txt
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: go
 2 | 
 3 | go:
 4 |   - 1.13.x
 5 | 
 6 | # whitelist
 7 | branches:
 8 |   only:
 9 |     - master
10 |     - stable
11 | 
12 | script:
13 |   - go get -t -v ./...
14 |   - go vet ./...
15 | - bash ./test.sh 16 | 17 | after_success: 18 | - bash <(curl -s https://codecov.io/bash) -------------------------------------------------------------------------------- /.vscode/bookmarks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "path": "$ROOTPATH$/output.test.txt", 4 | "bookmarks": [ 5 | { 6 | "line": 3088, 7 | "column": 1, 8 | "label": "" 9 | }, 10 | { 11 | "line": 3154, 12 | "column": 1, 13 | "label": "" 14 | }, 15 | { 16 | "line": 11967, 17 | "column": 1, 18 | "label": "" 19 | }, 20 | { 21 | "line": 12053, 22 | "column": 1, 23 | "label": "" 24 | } 25 | ] 26 | }, 27 | { 28 | "path": "$ROOTPATH$/Raft/code/config.go", 29 | "bookmarks": [ 30 | { 31 | "line": 470, 32 | "column": 11, 33 | "label": "" 34 | } 35 | ] 36 | } 37 | ] -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "Deserialize", 4 | "Jeiwan", 5 | "TXOs", 6 | "Txid", 7 | "UTXOs", 8 | "Vout", 9 | "abcdefghijklmn", 10 | "blockchain", 11 | "boltdb", 12 | "cbtx", 13 | "deserializes", 14 | "labrpc", 15 | "priv", 16 | "rcvr", 17 | "replyv" 18 | ] 19 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 aQua Yi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Mutual-Exclusion/README.md: -------------------------------------------------------------------------------- 1 | # Mutual Exclusion Algorithm Demo 2 | 3 | 使用 Go 语言实现了 Lamport 在论文 [《Time, Clocks and the Ordering of Events in a Distributed System》](time-clocks.pdf)中提到的 Mutual Exclusion 算法。 4 | 5 | ## 问题 6 | 7 | 多个 process 组成分享同一个 resource,但 resource 最多只能被一个 process 占用。由于 process 是分布式的,只能通过各自的 clock 读取时间值,这些 clock 的时间值不一定同步,没有办法通过时间上的编排来分别占用 resource。需要靠算法满足以下要求: 8 | 9 | 1. 对于 resource,一定要先释放,再占用。 10 | 1. 对于 process, 先申请,先占用。 11 | 1. 如果 process 占用 resource 的时间有限,那么,所有占用 resource 的申请,都会被满足。 12 | 13 | 为了简化问题,还存在以下假设: 14 | 15 | 1. 任意两个 process 都可以直接相互发送消息 16 | 1. 对于任意两个 process Pi 和 Pj 而言,从 Pi 发往 Pj 的消息,满足先发送先到达的原则 17 | 1. process 间发送的消息,一定会收到 18 | 19 | ## 从局部排序到全局排序 20 | 21 | 在展开之前,先强调几个定义: 22 | 23 | 1. 多个 process 中第 i 个 process 标记为 Pi 24 | 1. 
process 由一系列 event 组成,第 j 个 event 标记为 Ej 25 | 1. 每个 process 都有一个 clock 用于标记 event 发生的时间。第 i 个 process 发生第 j 个 event 的时间,标记为 Ci(Ej) 26 | 1. 每个 process 都是 **串行** 的 27 | 1. process 之间可以通过 send 和 receive message 来直接通信。send 和 receive 是两个 process 的单独事件。 28 | 29 | ### "happened before" 30 | 31 | "happened before" 表示一个局部排序关系,有两种情况下成立 32 | 33 | 1. 串行的 Pm 中, Ei 比 Ej 早发生。 Ei "happened before" Ej,所以有 Cm(Ei) < Cm(Ej)。 34 | 1. 从 Pm 发送到 Pn 中的消息 message,Pm 中 Ei 是发送 message, Pn 中 Ej 是接受 message。Ei "happened before" Ej,所以有 Cm(Ei) < Cn(Ej) 35 | 36 | 以上两条,在论文中被称为 `Clock Condition`。 37 | 38 | ### Lamport timestamps 39 | 40 | 为了让 system 中的 clocks 满足 `Clock Condition`,论文上的规定了 IR1 和 IR2,并在最后演变成了 [Lamport timestamps](https://en.wikipedia.org/wiki/Lamport_timestamps) 规则: 41 | 42 | 1. 进程在每做一件事情之前,计数器+1 43 | 1. 当进程发送消息的时候,需要带上计数器的值 44 | 1. 当进程接收消息的时候,需要根据消息中的值,更新自己的计数器。更新规则为 max(自身值,消息值)+1 45 | 46 | 以下是规则的伪代码 47 | 48 | ```code 49 | // 在进程内 50 | time_stamp = time_stamp + 1 51 | doOneEvent() 52 | 53 | // 进程发现消息时 54 | time = time + 1 55 | time_stamp = time 56 | send(message, time_stamp) 57 | 58 | // 进程接收消息时 59 | (message, time_stamp) = receive() 60 | time = max(time_stamp, time) + 1 61 | ``` 62 | 63 | > 基维百科上的说法和论文的说法,略有不同。我的代码以论文为准。 64 | 65 | ### 全局排序 66 | 67 | `Ei => Ej` 表示,在全局排序中, Ei 排在 Ej 前面。 68 | 69 | 对于 system 中的任意一个 event,可以使用其所在的 process P 和发生的 timestamp T 进行编号为: ``。 70 | 71 | 任意两个事件 Ei`` 和 Ej``, 若要使得 `Ei => Ej` 成立,需要以下两个条件之一成立: 72 | 73 | 1. Tm < Tn 74 | 1. Tm == Tn 且 Pa < Pb 75 | 76 | 其中 Pa < Pb 的含义是, system 中 process 中也存在一种排序方式。我在代码中选择使用 process 的代号,对其进行排序。 77 | 78 | ## mutual exclusion 算法 79 | 80 | mutual exclusion 算法需要每个 process 维护自己的 request queue。 由 5 个规则组成 81 | 82 | 1. 为了申请 resource,process Pi 需要 83 | 1. 生成 request `` 84 | 1. 发送 request message `` 到所有其他的 process 85 | 1. 把 `` 放入自己的 request queue 86 | 1. 当 Pj 收到 request message `` 后 87 | 1. 把 `` 放入自己的 request queue 88 | 1. 回复 Pi 一条 acknowledge message,确认收到。 89 | 1. 为了释放 resource,process Pi 需要 90 | 1. 释放 resource 91 | 1. 把 `` 移出自己的 request queue 92 | 1. 发送 release message `` 到所有其他的 process 93 | 1. 当 Pj 收到 release message `` 后 94 | 1. 把 `` 移出自己的 request queue 95 | 1. 当以下全部条件满足时,Pi 可以占用 resource: 96 | 1. 在 Pi 的 request queue 中,`` 与其他 event 都是 `=>` 关系。 97 | 1. Pi 收到所有其他 process 消息的最新时间中的最小值 > Tm 98 | 99 | 每个 process 只需要独立平等地处理这 5 种 event,就可以避免 process 同时占用 resource 的情况。 100 | 101 | 以上 5 个规则,是从 process 之间交互的角度来规定的。如果把 request `` 的整个生命周期放在 Pi 的时间轴上。如下图所示 102 | 103 | ![request周期](request_period.png) 104 | 105 | 从图中可以看到 106 | 107 | 1. 资源占用期两边是 `` 加入和退出 Pi.requestQueue 108 | 1. 操作 Pi.requestQueue 的两边是,给其他 process 发送消息的时间 109 | 110 | 这个顺序很重要,因为这个顺序保证了 `` 在 Pi 中满足 Rule5 的时候,`` 在所有的 process 的 request queue 中都是全局排序排在最前面的。所以 Pi 才能大胆地占用 resource,而不担心重复。 111 | 112 | ## 总结 113 | 114 | 由 lamport timestamps 规则和 process 排序,可以得到 system 内所有 event 的一种全局排序。request event 是全部 event 的子集,因此也可以全局排序。resource 占用顺序与其排序顺序一致。因此 mutual exclusion 算法能够满足要求。 115 | 116 | ## 思考问题 117 | 118 | 1. 为什么会出现多种全局排序?请举例说明。 119 | 1. 
真实时间上先 request 的 process 会不会后得到 resource?如果会的话,能不能说明 mutual exclusion 算法失败了?请说明理由。 120 | 121 | [参考答案](qna.md) 122 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/clock.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "math/rand" 5 | "sync" 6 | ) 7 | 8 | // Clock 是系统的逻辑时钟接口 9 | type Clock interface { 10 | // Update 根据输入参数更新自身的值 11 | Update(int) 12 | // Tick 时钟跳动一次,并返回最新的时间值 13 | Tick() int 14 | // Now 返回当前的时间值 15 | Now() int 16 | } 17 | 18 | type clock struct { 19 | time int 20 | rwmu sync.RWMutex 21 | } 22 | 23 | // 每个 process 的 clock 的 initial time,都是随机的 24 | func newClock() Clock { 25 | return &clock{ 26 | time: 1 + rand.Intn(100), 27 | } 28 | } 29 | 30 | func (c *clock) Update(t int) { 31 | c.rwmu.Lock() 32 | c.time = max(c.time, t+1) 33 | c.rwmu.Unlock() 34 | } 35 | 36 | func (c *clock) Tick() int { 37 | c.rwmu.Lock() 38 | c.time++ 39 | t := c.time 40 | c.rwmu.Unlock() 41 | return t 42 | } 43 | 44 | func (c *clock) Now() int { 45 | c.rwmu.RLock() 46 | t := c.time 47 | c.rwmu.RUnlock() 48 | return t 49 | } 50 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/clock_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func Test_clock_update(t *testing.T) { 10 | ast := assert.New(t) 11 | // 12 | c := newClock() 13 | newTime := 1000 14 | ast.True(newTime+1 >= c.Now()) 15 | // 16 | c.Update(newTime) 17 | // 18 | expected := newTime + 1 19 | actual := c.Now() 20 | ast.Equal(expected, actual) 21 | } 22 | 23 | func Test_clock_tick(t *testing.T) { 24 | ast := assert.New(t) 25 | // 26 | c := newClock() 27 | expected := c.Now() + 1 28 | actual := c.Tick() 29 | ast.Equal(expected, actual) 30 | } 31 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/message.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import "fmt" 4 | 5 | type message struct { 6 | msgType msgType 7 | from int // message 发送方的 ID 8 | to int // message 接收方的 ID, 当值为 OTHERS 的时候,表示接收方为除 from 外的所有 9 | timestamp Timestamp 10 | msgTime int 11 | } 12 | 13 | func newMessage(mt msgType, msgTime, from, to int, ts Timestamp) *message { 14 | return &message{ 15 | msgType: mt, 16 | msgTime: msgTime, 17 | from: from, 18 | to: to, 19 | timestamp: ts, 20 | } 21 | } 22 | 23 | func (m *message) String() string { 24 | return fmt.Sprintf("{%s, Time:%d, From:%d, To:%2d, %s}", m.msgType, m.msgTime, m.from, m.to, m.timestamp) 25 | } 26 | 27 | type msgType int 28 | 29 | // 枚举了 message 的所有类型 30 | const ( 31 | // REQUEST_RESOURCE 请求资源 32 | requestResource msgType = iota 33 | releaseResource 34 | acknowledgment 35 | ) 36 | 37 | func (mt msgType) String() string { 38 | switch mt { 39 | case requestResource: 40 | return "申请" 41 | case releaseResource: 42 | return "释放" 43 | default: 44 | return "确认" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/message_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func Test_Message(t *testing.T) { 10 | ast := 
assert.New(t) 11 | // 12 | ts := newTimestamp(0, 0) 13 | m := newMessage(requestResource, 0, 0, OTHERS, ts) 14 | // 15 | expected := "{申请, Time:0, From:0, To:-1, }" 16 | actual := m.String() 17 | ast.Equal(expected, actual) 18 | // 19 | m.msgType = releaseResource 20 | expected = "{释放, Time:0, From:0, To:-1, }" 21 | actual = m.String() 22 | ast.Equal(expected, actual) 23 | // 24 | m.msgType = acknowledgment 25 | expected = "{确认, Time:0, From:0, To:-1, }" 26 | actual = m.String() 27 | ast.Equal(expected, actual) 28 | // 29 | } 30 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/process.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | 7 | "github.com/aQuaYi/observer" 8 | ) 9 | 10 | // OTHERS 表示信息接收方为其他所有 process 11 | const OTHERS = -1 12 | 13 | // Process 是进程的接口 14 | type Process interface { 15 | // Request 会申请占用资源 16 | // 如果上次 Request 后,还没有占用并释放资源,会发生阻塞 17 | // 非线程安全 18 | Request() 19 | } 20 | 21 | type process struct { 22 | me int // process 的 ID 23 | wg sync.WaitGroup // 阻塞 Request() 用 24 | 25 | clock Clock 26 | resource Resource 27 | receivedTime ReceivedTime 28 | requestQueue RequestQueue 29 | 30 | mutex sync.Mutex 31 | // 为了保证发送消息的原子性, 32 | // 从生成 timestamp 开始到 prop.update 完成,这个过程需要上锁 33 | prop observer.Property 34 | // 操作以下属性,需要加锁 35 | isOccupying bool 36 | requestTimestamp Timestamp 37 | } 38 | 39 | func (p *process) String() string { 40 | return fmt.Sprintf("[%d]P%d", p.clock.Now(), p.me) 41 | } 42 | 43 | func newProcess(all, me int, r Resource, prop observer.Property) Process { 44 | p := &process{ 45 | me: me, 46 | resource: r, 47 | prop: prop, 48 | clock: newClock(), 49 | requestQueue: newRequestQueue(), 50 | receivedTime: newReceivedTime(all, me), 51 | } 52 | 53 | p.Listening() 54 | 55 | debugPrintf("%s 完成创建工作", p) 56 | 57 | return p 58 | } 59 | 60 | func (p *process) Listening() { 61 | // stream 的观察起点位置,由上层调用 newProcess 的方式决定 62 | // 在生成完所有的 process 后,再更新 prop, 63 | // 才能保证所有的 process 都能收到全部消息 64 | stream := p.prop.Observe() 65 | 66 | debugPrintf("%s 获取了 stream 开始监听", p) 67 | 68 | go func() { 69 | for { 70 | msg := stream.Next().(*message) 71 | if msg.from == p.me || 72 | (msg.msgType == acknowledgment && msg.to != p.me) { 73 | // 忽略不该看见的消息 74 | continue 75 | } 76 | 77 | p.updateTime(msg.from, msg.msgTime) 78 | 79 | switch msg.msgType { 80 | // case acknowledgment: 收到此类消息只用更新时钟,前面已经做了 81 | case requestResource: 82 | p.handleRequestMessage(msg) 83 | case releaseResource: 84 | p.handleReleaseMessage(msg) 85 | } 86 | p.checkRule5() 87 | } 88 | }() 89 | } 90 | 91 | func (p *process) updateTime(from, time int) { 92 | p.mutex.Lock() 93 | 94 | // 收到消息的第一件,更新自己的 clock 95 | p.clock.Update(time) 96 | // 然后为了 Rule5(ii) 记录收到消息的时间 97 | // NOTICE: 接收时间一定要是对方发出的时间 98 | p.receivedTime.Update(from, time) 99 | 100 | p.mutex.Unlock() 101 | } 102 | 103 | func (p *process) handleRequestMessage(msg *message) { 104 | 105 | // rule 2.1: 把 msg.timestamp 放入自己的 requestQueue 当中 106 | p.requestQueue.Push(msg.timestamp) 107 | 108 | debugPrintf("%s 添加了 %s 后的 request queue 是 %s", p, msg.timestamp, p.requestQueue) 109 | 110 | p.mutex.Lock() 111 | 112 | // rule 2.2: 给对方发送一条 acknowledge 消息 113 | p.prop.Update(newMessage( 114 | acknowledgment, 115 | p.clock.Tick(), 116 | p.me, 117 | msg.from, 118 | msg.timestamp, 119 | )) 120 | 121 | p.mutex.Unlock() 122 | } 123 | 124 | func (p *process) handleReleaseMessage(msg *message) { 125 | // rule 4: 从 request 
queue 中删除相应的申请 126 | p.requestQueue.Remove(msg.timestamp) 127 | debugPrintf("%s 删除了 %s 后的 request queue 是 %s", p, msg.timestamp, p.requestQueue) 128 | } 129 | 130 | func (p *process) checkRule5() { 131 | p.mutex.Lock() 132 | if p.isSatisfiedRule5() { 133 | p.occupyResource() 134 | go func() { 135 | // process 释放资源的时机交给 goroutine 调度 136 | p.releaseResource() 137 | }() 138 | } 139 | p.mutex.Unlock() 140 | } 141 | 142 | func (p *process) isSatisfiedRule5() bool { 143 | // 利用 checkRule5 的锁进行锁定 144 | return !p.isOccupying && // 还没有占领资源 145 | p.requestTimestamp != nil && // 已经申请资源 146 | p.requestTimestamp.IsEqual(p.requestQueue.Min()) && // Rule5.1 申请排在第一位 147 | p.requestTimestamp.IsBefore(p.receivedTime.Min()) // Rule5.2: 申请后,收到全部回复 148 | } 149 | 150 | func (p *process) occupyResource() { 151 | // 利用 checkRule5 的锁进行锁定 152 | debugPrintf("%s 准备占用资源 %s", p, p.requestQueue) 153 | p.isOccupying = true 154 | p.resource.Occupy(p.requestTimestamp) 155 | } 156 | 157 | func (p *process) releaseResource() { 158 | p.mutex.Lock() 159 | 160 | ts := p.requestTimestamp 161 | // rule 3: 先释放资源 162 | p.resource.Release(ts) 163 | // rule 3: 在 requestQueue 中删除 ts 164 | p.requestQueue.Remove(ts) 165 | // rule 3: 把释放的消息发送给其他 process 166 | msg := newMessage(releaseResource, p.clock.Tick(), p.me, OTHERS, ts) 167 | p.prop.Update(msg) 168 | p.isOccupying = false 169 | p.requestTimestamp = nil 170 | 171 | p.mutex.Unlock() 172 | 173 | p.wg.Done() 174 | } 175 | 176 | func (p *process) Request() { 177 | p.wg.Wait() 178 | p.wg.Add(1) 179 | 180 | p.mutex.Lock() 181 | 182 | p.clock.Tick() // 做事之前,先更新 clock 183 | ts := newTimestamp(p.clock.Now(), p.me) 184 | msg := newMessage(requestResource, p.clock.Now(), p.me, OTHERS, ts) 185 | // Rule 1.1: 发送申请信息给其他的 process 186 | p.prop.Update(msg) 187 | // Rule 1.2: 把申请消息放入自己的 request queue 188 | p.requestQueue.Push(ts) 189 | // 修改辅助属性,便于后续检查 190 | p.requestTimestamp = ts 191 | 192 | p.mutex.Unlock() 193 | } 194 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/process_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "testing" 7 | 8 | "github.com/aQuaYi/observer" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func run(all, occupyTimesPerProcess int) { 13 | rsc := newResource(all * occupyTimesPerProcess) 14 | 15 | prop := observer.NewProperty(nil) 16 | 17 | ps := make([]Process, all) 18 | // 需要一口气同时生成,保证所有的 stream 都能从同样的位置开始观察 19 | for i := range ps { 20 | p := newProcess(all, i, rsc, prop) 21 | ps[i] = p 22 | } 23 | debugPrintf("~~~ 已经成功创建了 %d 个 Process ~~~", all) 24 | 25 | stream := prop.Observe() 26 | go func() { 27 | for { 28 | msg := stream.Next().(*message) 29 | debugPrintf(" ## %s", msg) 30 | } 31 | }() 32 | 33 | for _, p := range ps { 34 | go func(p Process, times int) { 35 | i := 0 36 | debugPrintf("%s 开始申请资源", p) 37 | for i < times { 38 | p.Request() 39 | i++ 40 | } 41 | }(p, occupyTimesPerProcess) 42 | } 43 | 44 | rsc.wait() 45 | 46 | log.Println(rsc.report()) 47 | } 48 | 49 | func Test_process(t *testing.T) { 50 | ast := assert.New(t) 51 | // 52 | amount := 131072 53 | for all := 2; all <= 64; all *= 2 { 54 | times := amount / all 55 | name := fmt.Sprintf("%d Process × %d 次 = 共计 %d 次", all, times, amount) 56 | t.Run(name, func(t *testing.T) { 57 | ast.NotPanics(func() { 58 | run(all, times) 59 | }) 60 | }) 61 | } 62 | } 63 | 64 | func Test_process_String(t *testing.T) { 65 | ast := assert.New(t) 66 | // 
67 | me := 1 68 | clock := newClock() 69 | p := &process{ 70 | me: me, 71 | clock: clock, 72 | } 73 | time := 999 74 | p.clock.Update(time) 75 | expected := fmt.Sprintf("[%d]P%d", time+1, me) 76 | actual := p.String() 77 | ast.Equal(expected, actual) 78 | } 79 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/receivedTime.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | ) 7 | 8 | // ReceivedTime 是最新接受时间的操作接口 9 | // 因为 Rule5(ii) 需要用到最小的接受时间 10 | type ReceivedTime interface { 11 | // Update 更新从 process 接收到的最新时间 12 | Update(process, time int) 13 | // Min 返回从各个 process 接收时间的最小值 14 | Min() int 15 | } 16 | 17 | type receivedTime struct { 18 | trq *timeRecordQueue 19 | trs []*timeRecord 20 | mutex sync.Mutex 21 | } 22 | 23 | func newReceivedTime(all, me int) ReceivedTime { 24 | trq := new(timeRecordQueue) 25 | trs := make([]*timeRecord, all) 26 | for i := range trs { 27 | if i == me { 28 | continue 29 | } 30 | trs[i] = &timeRecord{} 31 | heap.Push(trq, trs[i]) 32 | } 33 | return &receivedTime{ 34 | trq: trq, 35 | trs: trs, 36 | } 37 | } 38 | 39 | func (rt *receivedTime) Update(id, time int) { 40 | rt.mutex.Lock() 41 | rt.trq.update(rt.trs[id], time) 42 | rt.mutex.Unlock() 43 | } 44 | 45 | // 返回 rt 中的最小值 46 | func (rt *receivedTime) Min() int { 47 | rt.mutex.Lock() 48 | defer rt.mutex.Unlock() 49 | return (*rt.trq)[0].time 50 | } 51 | 52 | // timeRecord 是 priorityQueue 中的元素 53 | type timeRecord struct { 54 | time int 55 | index int 56 | } 57 | 58 | type timeRecordQueue []*timeRecord 59 | 60 | func (trq timeRecordQueue) Len() int { return len(trq) } 61 | 62 | func (trq timeRecordQueue) Less(i, j int) bool { 63 | return trq[i].time < trq[j].time 64 | } 65 | 66 | func (trq timeRecordQueue) Swap(i, j int) { 67 | trq[i], trq[j] = trq[j], trq[i] 68 | trq[i].index = i 69 | trq[j].index = j 70 | } 71 | 72 | // Push 往 pq 中放 entry 73 | func (trq *timeRecordQueue) Push(x interface{}) { 74 | temp := x.(*timeRecord) 75 | temp.index = len(*trq) 76 | *trq = append(*trq, temp) 77 | } 78 | 79 | // Pop 从 pq 中取出最优先的 entry 80 | func (trq *timeRecordQueue) Pop() interface{} { 81 | temp := (*trq)[len(*trq)-1] 82 | temp.index = -1 // for safety 83 | *trq = (*trq)[0 : len(*trq)-1] 84 | return temp 85 | } 86 | 87 | func (trq *timeRecordQueue) update(tr *timeRecord, time int) { 88 | tr.time = time 89 | heap.Fix(trq, tr.index) 90 | } 91 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/receivedTime_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "container/heap" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func Test_receivedTime_update(t *testing.T) { 11 | ast := assert.New(t) 12 | all, me := 10, 0 13 | rt := newReceivedTime(all, me) 14 | // 把所有的接受值调整到较大的值 15 | for i := 1; i < all; i++ { 16 | rt.Update(i, all+1) 17 | } 18 | // 依次按照以最小值更新第 i 个时间值 19 | for i := all - 1; i > me; i-- { 20 | expected := i 21 | rt.Update(i, i) 22 | actual := rt.Min() 23 | ast.Equal(expected, actual) 24 | } 25 | } 26 | 27 | func Test_receivedTime_updateItselfWillPanic(t *testing.T) { 28 | ast := assert.New(t) 29 | all, me := 10, 0 30 | rt := newReceivedTime(all, me) 31 | ast.Panics(func() { rt.Update(me, 1) }) 32 | } 33 | 34 | func Test_timeRecordQueue_Pop(t *testing.T) { 35 | ast := assert.New(t) 36 | 
trq := new(timeRecordQueue) 37 | expected := &timeRecord{ 38 | time: 1, 39 | } 40 | heap.Push(trq, expected) 41 | actual := heap.Pop(trq).(*timeRecord) 42 | ast.Equal(expected.time, actual.time) 43 | } 44 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/requestQueue.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "container/heap" 5 | "strings" 6 | "sync" 7 | ) 8 | 9 | // RequestQueue 提供了操作 request queue 的接口 10 | type RequestQueue interface { 11 | // Min 返回最小的 Timestamp 值 12 | Min() Less 13 | // Push 把元素加入 RequestQueue 中 14 | Push(Less) 15 | // Remove 在 RequestQueue 中删除 Less 16 | Remove(Less) 17 | // String 输出 RequestQueue 的细节 18 | String() string 19 | } 20 | 21 | type requestQueue struct { 22 | rpq *requestPriorityQueue 23 | requestOf map[Less]*request 24 | mutex sync.Mutex 25 | } 26 | 27 | func newRequestQueue() RequestQueue { 28 | return &requestQueue{ 29 | rpq: new(requestPriorityQueue), 30 | requestOf: make(map[Less]*request, 1024), 31 | } 32 | } 33 | 34 | func (rq *requestQueue) Min() Less { 35 | rq.mutex.Lock() 36 | defer rq.mutex.Unlock() 37 | if len(*rq.rpq) == 0 { 38 | return nil 39 | } 40 | return (*rq.rpq)[0].ls 41 | } 42 | 43 | func (rq *requestQueue) Push(ls Less) { 44 | rq.mutex.Lock() 45 | r := &request{ 46 | ls: ls, 47 | } 48 | 49 | rq.requestOf[ls] = r 50 | heap.Push(rq.rpq, r) 51 | rq.mutex.Unlock() 52 | } 53 | 54 | func (rq *requestQueue) Remove(ls Less) { 55 | rq.mutex.Lock() 56 | rq.rpq.remove(rq.requestOf[ls]) 57 | delete(rq.requestOf, ls) 58 | rq.mutex.Unlock() 59 | } 60 | 61 | func (rq *requestQueue) String() string { 62 | return rq.rpq.String() 63 | } 64 | 65 | // Less 是 rpq 元素中的主要成分 66 | type Less interface { 67 | // Less 比较两个接口的值 68 | Less(interface{}) bool 69 | // String() 输出内容 70 | String() string 71 | } 72 | 73 | // request 是 priorityQueue 中的元素 74 | type request struct { 75 | ls Less 76 | index int 77 | } 78 | 79 | // rpq implements heap.Interface and holds entries. 
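// The queue is kept as a min-heap ordered by each request's ls.Less method;
// every request records its current heap position in index so that remove()
// can delete an arbitrary entry via heap.Remove in O(log n).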
80 | type requestPriorityQueue []*request 81 | 82 | func (q *requestPriorityQueue) String() string { 83 | var b strings.Builder 84 | b.WriteString("{request queue:") 85 | for i := range *q { 86 | b.WriteString((*q)[i].ls.String()) 87 | } 88 | b.WriteString("}") 89 | return b.String() 90 | } 91 | 92 | func (q requestPriorityQueue) Len() int { return len(q) } 93 | 94 | func (q requestPriorityQueue) Less(i, j int) bool { 95 | return q[i].ls.Less(q[j].ls) 96 | } 97 | 98 | func (q requestPriorityQueue) Swap(i, j int) { 99 | q[i], q[j] = q[j], q[i] 100 | q[i].index = i 101 | q[j].index = j 102 | } 103 | 104 | // Push 往 pq 中放 entry 105 | func (q *requestPriorityQueue) Push(x interface{}) { 106 | temp := x.(*request) 107 | temp.index = len(*q) 108 | *q = append(*q, temp) 109 | } 110 | 111 | // Pop 从 pq 中取出最优先的 entry 112 | func (q *requestPriorityQueue) Pop() interface{} { 113 | temp := (*q)[len(*q)-1] 114 | temp.index = -1 // for safety 115 | *q = (*q)[0 : len(*q)-1] 116 | return temp 117 | } 118 | 119 | func (q *requestPriorityQueue) remove(r *request) { 120 | heap.Remove(q, r.index) 121 | } 122 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/requestQueue_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func makeIncreasingTimestamps(half int) []Timestamp { 11 | res := make([]Timestamp, 0, half*2) 12 | for i := 0; i < half; i++ { 13 | res = append(res, 14 | newTimestamp(i, i*2), 15 | newTimestamp(i, i*2+1), 16 | ) 17 | } 18 | return res 19 | } 20 | 21 | func Test_requestQueue(t *testing.T) { 22 | ast := assert.New(t) 23 | // 24 | half := 10 25 | size := half * 2 26 | tss := makeIncreasingTimestamps(half) 27 | rq := newRequestQueue() 28 | // 29 | for i := size - 1; i >= 0; i-- { 30 | ts := tss[i] 31 | rq.Push(ts) // 每次放入到都是新的最小值 32 | expected := ts 33 | actual := rq.Min() 34 | ast.Equal(expected, actual) 35 | } 36 | // 37 | for i := 0; i+1 < size; i++ { 38 | rq.Remove(tss[i]) 39 | expected := tss[i+1] // 删除了最小值后,下个就是新的最小值 40 | actual := rq.Min() 41 | ast.Equal(expected, actual) 42 | } 43 | } 44 | 45 | func Test_requestQueue_remove(t *testing.T) { 46 | ast := assert.New(t) 47 | // 48 | half := 10 49 | size := half * 2 50 | tss := makeIncreasingTimestamps(half) 51 | rq := newRequestQueue() 52 | // 53 | for i := 0; i < size; i++ { 54 | ts := tss[i] 55 | rq.Push(ts) 56 | } 57 | // 58 | expected := tss[0] 59 | for i, j := 1, size-1; i < j; i, j = i+1, j-1 { 60 | rq.Remove(tss[i]) 61 | actual := rq.Min() 62 | ast.Equal(expected, actual) 63 | // 64 | rq.Remove(tss[j]) 65 | actual = rq.Min() 66 | ast.Equal(expected, actual) 67 | } 68 | } 69 | 70 | func Test_requestQueue_MinOfEmpty(t *testing.T) { 71 | ast := assert.New(t) 72 | rq := newRequestQueue() 73 | ast.Nil(rq.Min()) 74 | } 75 | 76 | func Test_requestQueue_String(t *testing.T) { 77 | ast := assert.New(t) 78 | size := 100 79 | // 创建 timestamps 80 | timestamps := make([]Timestamp, 0, size) 81 | for i := 1; i < size; i++ { 82 | timestamps = append(timestamps, newTimestamp(i, i)) 83 | } 84 | // 创建 requestQueue,并添加 timestamp 85 | rq := newRequestQueue() 86 | for i := range timestamps { 87 | rq.Push(timestamps[i]) 88 | } 89 | // 获取 rq 的字符输出 90 | rqs := rq.String() 91 | // 验证 rqs 中的内容 92 | for i := range timestamps { 93 | tss := timestamps[i].String() 94 | ast.True(strings.Contains(rqs, tss)) 95 | } 96 | } 97 | 
-------------------------------------------------------------------------------- /Mutual-Exclusion/code/resource.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "sync" 7 | "time" 8 | 9 | "github.com/montanaflynn/stats" 10 | ) 11 | 12 | // Resource 是 Process 占用资源的接口 13 | type Resource interface { 14 | // Occupy 表示占用资源 15 | Occupy(Timestamp) 16 | // Release 表示释放资源 17 | Release(Timestamp) 18 | } 19 | 20 | type resource struct { 21 | lastOccupiedBy Timestamp // 记录上次占用资源的 timestamp 22 | occupiedBy Timestamp // 记录当前占用资源的 timestamp, nil 表示资源未被占用 23 | timestamps []Timestamp // 按顺序保存占用资源的 timestamp 24 | times []time.Time // 记录每次占用资源的起止时间,用于分析算法的效率 25 | wg sync.WaitGroup // 完成全部占用前,阻塞主 goroutine 26 | } 27 | 28 | func newResource(times int) *resource { 29 | r := &resource{ 30 | lastOccupiedBy: newTimestamp(-1, -1), 31 | } 32 | r.wg.Add(times) 33 | return r 34 | } 35 | 36 | func (r *resource) wait() { 37 | r.wg.Wait() 38 | } 39 | 40 | func (r *resource) Occupy(ts Timestamp) { 41 | r.times = append(r.times, time.Now()) 42 | 43 | if r.occupiedBy != nil { 44 | msg := fmt.Sprintf("资源正在被 %s 占据,%s 却想获取资源。", r.occupiedBy, ts) 45 | panic(msg) 46 | } 47 | 48 | if !r.lastOccupiedBy.Less(ts) { 49 | msg := fmt.Sprintf("资源上次被 %s 占据,这次 %s 却想占据资源。", r.lastOccupiedBy, ts) 50 | panic(msg) 51 | } 52 | 53 | r.occupiedBy = ts 54 | r.timestamps = append(r.timestamps, ts) 55 | debugPrintf("~~~ @resource: %s occupied ~~~ ", ts) 56 | } 57 | 58 | func (r *resource) Release(ts Timestamp) { 59 | if !r.occupiedBy.IsEqual(ts) { 60 | msg := fmt.Sprintf("%s 想要释放正在被 P%s 占据的资源。", ts, r.occupiedBy) 61 | panic(msg) 62 | } 63 | 64 | r.lastOccupiedBy, r.occupiedBy = ts, nil 65 | r.times = append(r.times, time.Now()) 66 | debugPrintf("~~~ @resource: %s released ~~~ ", ts) 67 | 68 | r.wg.Done() // 完成一次占用 69 | } 70 | 71 | func (r *resource) report() string { 72 | var b strings.Builder 73 | size := len(r.times) 74 | 75 | // 统计资源被占用的时间 76 | totalTime := r.times[size-1].Sub(r.times[0]) 77 | format := "resource 被占用了 %s, " 78 | fmt.Fprintf(&b, format, totalTime) 79 | 80 | // 计算占用率 81 | busys := make([]float64, 0, size/2) 82 | idles := make([]float64, 0, size/2) 83 | 84 | var i int 85 | for i = 0; i+2 < size; i += 2 { 86 | busys = append(busys, float64(r.times[i+1].Sub(r.times[i]).Nanoseconds())) 87 | idles = append(idles, float64(r.times[i+2].Sub(r.times[i+1]).Nanoseconds())) 88 | } 89 | busys = append(busys, float64(r.times[i+1].Sub(r.times[i]).Nanoseconds())) 90 | 91 | busy, _ := stats.Sum(busys) 92 | idle, _ := stats.Sum(idles) 93 | total := busy + idle 94 | rate := busy * 100 / total 95 | 96 | format = "占用比率为 %4.2f%%。\n" 97 | fmt.Fprintf(&b, format, rate) 98 | 99 | // 计算资源占用时间的均值和方差 100 | format = "资源占用: %s\n" 101 | fmt.Fprintf(&b, format, statisticAnalyze(busys)) 102 | 103 | // 计算资源空闲间隙的均值和方差 104 | format = "资源空闲: %s\n" 105 | fmt.Fprintf(&b, format, statisticAnalyze(idles)) 106 | 107 | return b.String() 108 | } 109 | 110 | func statisticAnalyze(floats []float64) string { 111 | format := "min %8.2fus, max %8.2fus, mean %8.2fus, sd %8.2f" 112 | min, _ := stats.Min(floats) 113 | max, _ := stats.Max(floats) 114 | mean, _ := stats.Mean(floats) 115 | sd, _ := stats.StandardDeviation(floats) 116 | return fmt.Sprintf(format, min/1000, max/1000, mean/1000, sd/1000) 117 | } 118 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/resource_test.go: 
-------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | "time" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func Test_resource_occupyAndRelease(t *testing.T) { 13 | ast := assert.New(t) 14 | // 15 | p := 0 16 | ts := newTimestamp(0, p) 17 | r := newResource(1) 18 | // 占用 19 | r.Occupy(ts) 20 | ast.Equal(ts, r.occupiedBy) 21 | // 释放 22 | r.Release(ts) 23 | r.wait() 24 | ast.Equal(ts, r.lastOccupiedBy) 25 | ast.Equal(ts, r.timestamps[0]) 26 | } 27 | 28 | func Test_resource_occupy_occupyInvalidResource(t *testing.T) { 29 | ast := assert.New(t) 30 | // 31 | p0 := 0 32 | p1 := 1 33 | ts0 := newTimestamp(0, p0) 34 | ts1 := newTimestamp(1, p1) 35 | r := newResource(1) 36 | r.Occupy(ts0) 37 | // 38 | expected := fmt.Sprintf("资源正在被 %s 占据,%s 却想获取资源。", ts0, ts1) 39 | ast.PanicsWithValue(expected, func() { r.Occupy(ts1) }) 40 | } 41 | 42 | func Test_resource_occupy_panicOfEarlyTimestampWantToOccupy(t *testing.T) { 43 | ast := assert.New(t) 44 | // 45 | ts0 := newTimestamp(0, 1) 46 | ts1 := newTimestamp(1, 1) 47 | r := newResource(2) 48 | r.Occupy(ts1) 49 | r.Release(ts1) 50 | // 51 | expected := fmt.Sprintf("资源上次被 %s 占据,这次 %s 却想占据资源。", ts1, ts0) 52 | ast.PanicsWithValue(expected, func() { r.Occupy(ts0) }) 53 | } 54 | 55 | func Test_resource_report(t *testing.T) { 56 | ast := assert.New(t) 57 | // 58 | p := 0 59 | ts0 := newTimestamp(0, p) 60 | ts1 := newTimestamp(1, p) 61 | r := newResource(3) 62 | r.Occupy(ts0) 63 | r.Release(ts0) 64 | r.Occupy(ts1) 65 | r.Release(ts1) 66 | now := time.Now() 67 | r.times[0] = now 68 | r.times[1] = now.Add(100 * time.Second) 69 | r.times[2] = now.Add(200 * time.Second) 70 | r.times[3] = now.Add(400 * time.Second) 71 | // 72 | report := r.report() 73 | ast.True(strings.Contains(report, "75.00%"), report) 74 | // 75 | ast.Equal(4, len(r.times), "资源被占用了 2 次,但是 r.times 的长度不等于 4") 76 | } 77 | 78 | func Test_resource_Occupy_lenOfTimes(t *testing.T) { 79 | ast := assert.New(t) 80 | // 81 | times := 100 82 | r := newResource(times) 83 | go func(max int) { 84 | time, p := 0, 0 85 | for i := 0; i < max; i++ { 86 | if i%2 == 0 { 87 | time++ 88 | } else { 89 | p++ 90 | } 91 | ts := newTimestamp(time, p) 92 | r.Occupy(ts) 93 | r.Release(ts) 94 | } 95 | }(times) 96 | r.wait() 97 | expected := times * 2 98 | actual := len(r.times) 99 | ast.Equal(expected, actual) 100 | } 101 | 102 | func Test_resource_Release_panicOfReleaseByOther(t *testing.T) { 103 | ast := assert.New(t) 104 | // 105 | r := newResource(1) 106 | ts0 := newTimestamp(0, 1) 107 | ts1 := newTimestamp(1, 1) 108 | r.Occupy(ts0) 109 | expected := fmt.Sprintf("%s 想要释放正在被 P%s 占据的资源。", ts1, ts0) 110 | ast.PanicsWithValue(expected, func() { r.Release(ts1) }) 111 | } 112 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/timestamp.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import "fmt" 4 | 5 | // Timestamp 是用于全局排序的接口 6 | type Timestamp interface { 7 | // Less 比较两个 Timestamp 的大小 8 | Less(interface{}) bool 9 | // Equal 判断两个 Timestamp 是否相等 10 | IsEqual(interface{}) bool 11 | // IsBefore 在比较同一个 clock 的时间,所以,不需要 process 12 | IsBefore(int) bool 13 | // String 输出 Timestamp 的内容 14 | String() string 15 | } 16 | 17 | type timestamp struct { 18 | time, process int 19 | } 20 | 21 | func newTimestamp(time, process int) Timestamp { 22 | return ×tamp{ 23 | time: time, 24 | process: 
process, 25 | } 26 | } 27 | 28 | func (ts *timestamp) String() string { 29 | return fmt.Sprintf("", ts.time, ts.process) 30 | } 31 | 32 | func (ts *timestamp) Less(tsi interface{}) bool { 33 | ts2 := tsi.(*timestamp) 34 | // 这就是将局部顺序推广到全局顺序的关键 35 | if ts.time == ts2.time { 36 | return ts.process < ts2.process 37 | } 38 | return ts.time < ts2.time 39 | } 40 | 41 | func (ts *timestamp) IsEqual(tsi interface{}) bool { 42 | if tsi == nil { 43 | return false 44 | } 45 | ts2 := tsi.(*timestamp) 46 | return ts.time == ts2.time && ts.process == ts2.process 47 | } 48 | 49 | func (ts *timestamp) IsBefore(t int) bool { 50 | return ts.time < t 51 | } 52 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/timestamp_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func Test_timestamp_String(t *testing.T) { 10 | ast := assert.New(t) 11 | ts := newTimestamp(0, 1) 12 | actual := ts.String() 13 | expected := "" 14 | ast.Equal(expected, actual) 15 | } 16 | 17 | func Test_timestamp_Less(t *testing.T) { 18 | ast := assert.New(t) 19 | 20 | // a < b < c 21 | a := newTimestamp(1, 1) 22 | b := newTimestamp(1, 2) 23 | c := newTimestamp(2, 3) 24 | 25 | ast.True(a.Less(b)) 26 | ast.True(a.Less(c)) 27 | ast.True(b.Less(c)) 28 | 29 | ast.False(b.Less(a)) 30 | ast.False(c.Less(a)) 31 | ast.False(c.Less(b)) 32 | } 33 | 34 | func Test_timestamp_IsEqual_nil_false(t *testing.T) { 35 | ast := assert.New(t) 36 | ts := newTimestamp(0, 0) 37 | ast.False(ts.IsEqual(nil)) 38 | } 39 | 40 | func Test_timestamp_IsEqual_same_true(t *testing.T) { 41 | ast := assert.New(t) 42 | time, process := 0, 0 43 | ts := newTimestamp(time, process) 44 | tsi := newTimestamp(time, process) 45 | ast.True(ts.IsEqual(tsi)) 46 | } 47 | 48 | func Test_timestamp_IsBefore(t *testing.T) { 49 | ast := assert.New(t) 50 | time, process := 1, 0 51 | ts := newTimestamp(time, process) 52 | ast.False(ts.IsBefore(0)) 53 | ast.True(ts.IsBefore(2)) 54 | } 55 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/util.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "log" 5 | "math/rand" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | func init() { 11 | log.SetFlags(log.LstdFlags | log.Lmicroseconds) 12 | debugPrintf("程序开始运行") 13 | rand.Seed(time.Now().UnixNano()) 14 | } 15 | 16 | var needDebug = false 17 | 18 | // 读取和修改 needebug 前需要上锁 19 | var rwm sync.RWMutex 20 | 21 | // debugPrintf 根据设置打印输出 22 | func debugPrintf(format string, a ...interface{}) { 23 | rwm.RLock() 24 | if needDebug { 25 | log.Printf(format, a...) 
26 | } 27 | rwm.RUnlock() 28 | } 29 | 30 | func max(a, b int) int { 31 | if a > b { 32 | return a 33 | } 34 | return b 35 | } 36 | -------------------------------------------------------------------------------- /Mutual-Exclusion/code/util_test.go: -------------------------------------------------------------------------------- 1 | package mutualexclusion 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func Test_debugPrintf_toPrint(t *testing.T) { 13 | rwm.Lock() 14 | temp := needDebug 15 | needDebug = true 16 | rwm.Unlock() 17 | // 18 | var sb strings.Builder 19 | log.SetOutput(&sb) 20 | defer log.SetOutput(os.Stderr) 21 | // 22 | ast := assert.New(t) 23 | // 24 | words := "众鸟高飞尽,孤云独去闲。" 25 | // 26 | debugPrintf("%s", words) 27 | // 28 | ast.True(strings.Contains(sb.String(), words)) 29 | // 还原 needDebug 30 | rwm.Lock() 31 | needDebug = temp 32 | rwm.Unlock() 33 | } 34 | 35 | func Test_debugPrintf_notToPrint(t *testing.T) { 36 | rwm.Lock() 37 | temp := needDebug 38 | needDebug = false 39 | rwm.Unlock() 40 | // 41 | var b strings.Builder 42 | log.SetOutput(&b) 43 | defer log.SetOutput(os.Stderr) 44 | // 45 | ast := assert.New(t) 46 | // 47 | words := "众鸟高飞尽,孤云独去闲。" 48 | // 49 | debugPrintf("%s", words) 50 | // 51 | ast.False(strings.Contains(b.String(), words)) 52 | // 还原 needDebug 53 | rwm.Lock() 54 | needDebug = temp 55 | rwm.Unlock() 56 | } 57 | 58 | func Test_max(t *testing.T) { 59 | type args struct { 60 | a int 61 | b int 62 | } 63 | tests := []struct { 64 | name string 65 | args args 66 | want int 67 | }{ 68 | 69 | { 70 | "a < b", 71 | args{ 72 | 1, 73 | 2, 74 | }, 75 | 2, 76 | }, 77 | 78 | { 79 | "a > b", 80 | args{ 81 | 2, 82 | 1, 83 | }, 84 | 2, 85 | }, 86 | 87 | { 88 | "a = b", 89 | args{ 90 | 2, 91 | 2, 92 | }, 93 | 2, 94 | }, 95 | } 96 | for _, tt := range tests { 97 | t.Run(tt.name, func(t *testing.T) { 98 | if got := max(tt.args.a, tt.args.b); got != tt.want { 99 | t.Errorf("max() = %v, want %v", got, tt.want) 100 | } 101 | }) 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /Mutual-Exclusion/mutual_exclusion_request_period.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Mutual-Exclusion/mutual_exclusion_request_period.pptx -------------------------------------------------------------------------------- /Mutual-Exclusion/qna.md: -------------------------------------------------------------------------------- 1 | # 思考问题 2 | 3 | 1.为什么会出现多种全局排序?请举例说明。 4 | 5 | ```text 6 | 由于不同 process 的 event 可能会有相同的 timestamp,例如 E1 = `<3:7>`, E2 = `<3:5>`。 7 | 如果 P7 < P5 的话, E1 => E2。 8 | 如果 P5 < P7 的话, E2 => E1。 9 | ``` 10 | 11 | 2.真实时间上先 request 的 process 会不会后得到 resource?如果会的话,能不能说明 mutual exclusion 算法失败了?请说明理由。 12 | 13 | ```text 14 | 会,但不是全局排序的失败。 15 | 16 | 我在编程的时候,就遇到了这个问题。开始以为是程序的 bug 。后来重新阅读的论文,才发现是自己的理解的不够。 17 | 18 | 首先区分一下时间(time)和时刻(timestamp),时间是一个物理量,时刻是这个物理量的值,2018年05月15日15:20:55 是现在的时刻。就好像温度是一个物理量,33℃是温度的一个值。但是如果33℃的物体比44℃的物体摸起来要热,只能说明这个物体不是使用同一个温度计测量的温度,并且两个温度计的基准差别还蛮大。 19 | 第二,时刻(timestamp)的作用是给 event 一个标记,多个 event 可以利用这个时间标记进行排序。例如,同一天中,E1(15:31:31) 排在 E2(15:31:51) 前面。但这包含了一个隐含前提,这两个 event 的时刻,是由同一个可靠的 clock 标记的。 20 | 第三,mutual exclusion 是一个分布式算法。每个 process 都有自己单独的 clock。不同 process 中的 event 的时间标记都是不同的 clock 标记的。考虑到程序运行的速度,这些 clock 与真实时间之间的偏差,绝对不能忽略不计。 21 | 第四,为了 process 间的局部排序,引入了 message 机制,并制定了 lamport 
timestamp 规则。为了全局排序,再引入 process 排序。 22 | 23 | 再解释一下题意,存在一个观察者,拿着同一个 clock 去分别标记每个 process 的 request,结果发现某个先标记的 request 却后得到了 resource。 24 | 这不能说明 mutual exclusion 算法失败的原因是,这个算法就是为了解决分布式系统中,不可能存在同一个 clock 去分别标记每个 process 的 request 的问题而提出的。 25 | 26 | 如果能像 Google 在 [Spanner](spanner-osdi2012.pdf) 里面,引入 True Time 一样,使得各个 process 的 clock 之间的偏差,相对于程序的速度可以忽略不计。就可以保证真实时间上先 request 的 process 先占用 resource。那样的话,也不需要 mutual exclusion 算法了。 27 | ``` 28 | -------------------------------------------------------------------------------- /Mutual-Exclusion/request_period.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Mutual-Exclusion/request_period.png -------------------------------------------------------------------------------- /Mutual-Exclusion/spanner-osdi2012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Mutual-Exclusion/spanner-osdi2012.pdf -------------------------------------------------------------------------------- /Mutual-Exclusion/time-clocks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Mutual-Exclusion/time-clocks.pdf -------------------------------------------------------------------------------- /PoW.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/PoW.zip -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 分布式系统原型 2 | 3 | [![License](http://img.shields.io/badge/license-MIT-red.svg?style=flat)](https://github.com/aQuaYi/Distributed-Algorithms/blob/master/LICENSE) 4 | [![Build Status](https://www.travis-ci.org/aQuaYi/Distributed-Algorithms.svg?branch=master)](https://www.travis-ci.org/aQuaYi/Distributed-Algorithms) 5 | [![codecov](https://codecov.io/gh/aQuaYi/Distributed-Algorithms/branch/master/graph/badge.svg)](https://codecov.io/gh/aQuaYi/Distributed-Algorithms) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/aQuaYi/Distributed-Algorithms)](https://goreportcard.com/report/github.com/aQuaYi/Distributed-Algorithms) 7 | 8 | ## [Mutual Exclusion](Mutual-Exclusion) 9 | 10 | Lamport 在论文《Time, Clocks and the Ordering of Events in a Distributed System》中提到的 Mutual Exclusion 算法。 11 | 12 | ## [Raft](Raft) 13 | 14 | Diego Ongaro 和 John Ousterhout 认为 Paxos 难以理解, 于是在 [《In Search of an Understandable Consensus Algorithm (Extended Version)》](Raft/raft-extended.pdf) 中以可理解为目标,提出了一种新的共识算法——Raft。 15 | 16 | ## [PoW](PoW) 17 | 18 | 为了实现去中心化的数字货币--[Bitcoin](https://github.com/bitcoin/bitcoin) 19 | ,[中本聪](https://zh.wikipedia.org/zh-hans/%E4%B8%AD%E6%9C%AC%E8%81%AA)利用 PoW (Proof of Work) 算法来解决系统中的拜占庭将军问题。 20 | 21 | ## PoS 22 | 23 | ## DPoS 24 | 25 | ## PBFT 26 | 27 | TODO: 总结分布式系统的特点 -------------------------------------------------------------------------------- /Raft/6.824-2018.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Raft/6.824-2018.zip 
-------------------------------------------------------------------------------- /Raft/6.824Lab2_Raft.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Raft/6.824Lab2_Raft.html -------------------------------------------------------------------------------- /Raft/6.824Lab2_Raft_files/style.css: -------------------------------------------------------------------------------- 1 | body { max-width: 45em; } 2 | body pre { overflow-x: auto; } 3 | 4 | body { 5 | color: black; 6 | background-color: white; 7 | font-family: sans-serif; 8 | } 9 | 10 | .title { 11 | text-align: center 12 | } 13 | .subtitle { 14 | text-align: center; 15 | font-style: italic; 16 | } 17 | .author { 18 | text-align: center; 19 | } 20 | 21 | ul.hints, .note, .challenge, .todo, pre { 22 | margin: 1em; 23 | border: 1px dashed; 24 | padding: 1em; 25 | } 26 | 27 | ul.hints { color: #50A02D; } 28 | ul.hints li { margin-left: 1em; } 29 | ul.hints li::before { 30 | content: "Hint: "; 31 | font-weight: bold; 32 | } 33 | 34 | .important { 35 | margin: 1em; 36 | padding: 1em; 37 | background-color: #990000; 38 | color: #fff; 39 | } 40 | .important::before { 41 | content: "Important: "; 42 | background-color: #550000; 43 | width: 100%; 44 | display: block; 45 | margin: -1em -1em 1em -1em; 46 | padding: 1em; 47 | font-weight: bold; 48 | } 49 | .note { color: #4682B4; } 50 | .note::before { 51 | content: "Note: "; 52 | font-weight: bold; 53 | } 54 | 55 | .challenge, .todo { border-style: solid; } 56 | .challenge::before, .todo::before { 57 | float: right; 58 | font-weight: bold; 59 | color: white; 60 | margin-right: -1em; 61 | margin-top: -1em; 62 | margin-bottom: .5em; 63 | margin-left: 1em; 64 | padding: .5em 1em; 65 | } 66 | .todo { color: #B22222; } 67 | .todo::before { 68 | content: "TASK"; 69 | background: #B22222; 70 | } 71 | .challenge { color: #8B4513; } 72 | .challenge::before { 73 | content: "CHALLENGE"; 74 | background: #8B4513; 75 | } 76 | 77 | 78 | tt, code { 79 | font-family: monospace; 80 | border-radius: 3px; 81 | font-size: 110%; 82 | color: #657b83; 83 | background-color: #fdf6e3; 84 | padding: 0.2em; 85 | word-wrap: break-word; 86 | } 87 | 88 | pre { 89 | font-size: 100%; 90 | color: #839496; 91 | background: #002b36; 92 | } 93 | 94 | .classic { 95 | color: black; 96 | } 97 | 98 | 99 | div.required .header { 100 | font-weight: bold; 101 | } 102 | 103 | div.challenge .header { 104 | font-style: italic; 105 | } 106 | 107 | div.required { 108 | background-color: #eeeeff; 109 | } 110 | 111 | -------------------------------------------------------------------------------- /Raft/README.md: -------------------------------------------------------------------------------- 1 | # Raft: 一个比 Paxos 好懂的共识算法 2 | 3 | Diego Ongaro 和 John Ousterhout 认为 Paxos 难以理解, 于是在 [《In Search of an Understandable Consensus Algorithm (Extended Version)》](raft-extended.pdf) 中以可理解为目标,提出了一种新的共识算法——Raft。 4 | 5 | 本 demo 的测试代码及其辅助库来源于 [MIT 6.824 课程](https://pdos.csail.mit.edu/6.824/) 的课程实验。原始代码[点击这里下载](6.824-2018.zip),代码说明在[此页面](6.824Lab2_Raft.html)。 6 | 7 | 相关资料: 8 | 9 | - [《In Search of an Understandable Consensus Algorithm (Extended Version)》](raft-extended.pdf) 10 | - [Raft 算法演示](http://thesecretlivesofdata.com/raft/) 11 | -------------------------------------------------------------------------------- /Raft/code/config.go: -------------------------------------------------------------------------------- 1 | 
package raft 2 | 3 | // 4 | // support for Raft tester. 5 | // 6 | // we will use the original config.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 9 | // 10 | 11 | import ( 12 | "log" 13 | "math/rand" 14 | "runtime" 15 | "sync" 16 | "testing" 17 | 18 | crand "crypto/rand" 19 | "encoding/base64" 20 | "fmt" 21 | "math/big" 22 | "time" 23 | 24 | "github.com/aQuaYi/Distributed-Algorithms/Raft/code/labrpc" 25 | ) 26 | 27 | func randstring(n int) string { 28 | b := make([]byte, 2*n) 29 | crand.Read(b) 30 | s := base64.URLEncoding.EncodeToString(b) 31 | return s[0:n] 32 | } 33 | 34 | func makeSeed() int64 { 35 | max := big.NewInt(int64(1) << 62) 36 | bigx, _ := crand.Int(crand.Reader, max) 37 | x := bigx.Int64() 38 | return x 39 | } 40 | 41 | type config struct { 42 | mu sync.Mutex 43 | t *testing.T 44 | net *labrpc.Network 45 | n int 46 | rafts []*Raft 47 | applyErr []string // from apply channel readers 48 | connected []bool // whether each server is on the net 49 | saved []*Persister 50 | endnames [][]string // the port file names each sends to 51 | logs []map[int]int // copy of each server's committed entries 52 | start time.Time // time at which make_config() was called 53 | // begin()/end() statistics 54 | beginTime time.Time // time at which test_test.go called cfg.begin() 55 | rpcs0 int // rpcTotal() at start of test 56 | cmds0 int // number of agreements 57 | maxIndex int 58 | maxIndex0 int 59 | } 60 | 61 | var ncpuOnce sync.Once 62 | 63 | func makeConfig(t *testing.T, n int, unreliable bool) *config { 64 | ncpuOnce.Do(func() { 65 | if runtime.NumCPU() < 2 { 66 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 67 | } 68 | rand.Seed(makeSeed()) 69 | }) 70 | runtime.GOMAXPROCS(4) 71 | cfg := &config{} 72 | cfg.t = t 73 | cfg.net = labrpc.MakeNetwork() 74 | cfg.n = n 75 | cfg.applyErr = make([]string, cfg.n) 76 | cfg.rafts = make([]*Raft, cfg.n) 77 | cfg.connected = make([]bool, cfg.n) 78 | cfg.saved = make([]*Persister, cfg.n) 79 | cfg.endnames = make([][]string, cfg.n) 80 | cfg.logs = make([]map[int]int, cfg.n) 81 | cfg.start = time.Now() 82 | 83 | cfg.setunreliable(unreliable) 84 | 85 | cfg.net.LongDelays(true) 86 | 87 | // create a full set of Rafts. 88 | for i := 0; i < cfg.n; i++ { 89 | cfg.logs[i] = map[int]int{} 90 | cfg.start1(i) 91 | } 92 | 93 | // connect everyone 94 | for i := 0; i < cfg.n; i++ { 95 | cfg.connect(i) 96 | } 97 | 98 | return cfg 99 | } 100 | 101 | // shut down a Raft server but save its persistent state. 102 | func (cfg *config) crash1(i int) { 103 | cfg.disconnect(i) 104 | cfg.net.DeleteServer(i) // disable client connections to the server. 105 | 106 | cfg.mu.Lock() 107 | defer cfg.mu.Unlock() 108 | 109 | // a fresh persister, in case old instance 110 | // continues to update the Persister. 111 | // but copy old persister's content so that we always 112 | // pass Make() the last persisted state. 113 | if cfg.saved[i] != nil { 114 | cfg.saved[i] = cfg.saved[i].Copy() 115 | } 116 | 117 | rf := cfg.rafts[i] 118 | if rf != nil { 119 | cfg.mu.Unlock() 120 | rf.Kill() 121 | cfg.mu.Lock() 122 | cfg.rafts[i] = nil 123 | } 124 | 125 | if cfg.saved[i] != nil { 126 | raftlog := cfg.saved[i].ReadRaftState() 127 | cfg.saved[i] = &Persister{} 128 | cfg.saved[i].SaveRaftState(raftlog) 129 | } 130 | } 131 | 132 | // 133 | // start or re-start a Raft. 134 | // if one already exists, "kill" it first. 
135 | // allocate new outgoing port file names, and a new 136 | // state persister, to isolate previous instance of 137 | // this server. since we cannot really kill it. 138 | // 139 | func (cfg *config) start1(i int) { 140 | cfg.crash1(i) 141 | 142 | // a fresh set of outgoing ClientEnd names. 143 | // so that old crashed instance's ClientEnds can't send. 144 | cfg.endnames[i] = make([]string, cfg.n) 145 | for j := 0; j < cfg.n; j++ { 146 | cfg.endnames[i][j] = randstring(20) 147 | } 148 | 149 | // a fresh set of ClientEnds. 150 | ends := make([]*labrpc.ClientEnd, cfg.n) 151 | for j := 0; j < cfg.n; j++ { 152 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 153 | cfg.net.Connect(cfg.endnames[i][j], j) 154 | } 155 | 156 | cfg.mu.Lock() 157 | 158 | // a fresh persister, so old instance doesn't overwrite 159 | // new instance's persisted state. 160 | // but copy old persister's content so that we always 161 | // pass Make() the last persisted state. 162 | if cfg.saved[i] != nil { 163 | cfg.saved[i] = cfg.saved[i].Copy() 164 | } else { 165 | cfg.saved[i] = MakePersister() 166 | } 167 | 168 | cfg.mu.Unlock() 169 | 170 | // listen to messages from Raft indicating newly committed messages. 171 | applyCh := make(chan ApplyMsg) 172 | go func() { 173 | for m := range applyCh { 174 | errMsg := "" 175 | if m.CommandValid == false { 176 | // ignore other types of ApplyMsg 177 | } else if v, ok := (m.Command).(int); ok { 178 | cfg.mu.Lock() 179 | for j := 0; j < len(cfg.logs); j++ { 180 | if old, oldOK := cfg.logs[j][m.CommandIndex]; oldOK && old != v { 181 | // some server has already committed a different value for this entry! 182 | errMsg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v", 183 | m.CommandIndex, i, m.Command, j, old) 184 | } 185 | } 186 | _, prevOK := cfg.logs[i][m.CommandIndex-1] 187 | cfg.logs[i][m.CommandIndex] = v 188 | if m.CommandIndex > cfg.maxIndex { 189 | cfg.maxIndex = m.CommandIndex 190 | } 191 | cfg.mu.Unlock() 192 | 193 | if m.CommandIndex > 1 && prevOK == false { 194 | errMsg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 195 | } 196 | } else { 197 | errMsg = fmt.Sprintf("committed command %v is not an int", m.Command) 198 | } 199 | 200 | if errMsg != "" { 201 | log.Fatalf("apply error: %v\n", errMsg) 202 | cfg.applyErr[i] = errMsg 203 | // keep reading after error so that Raft doesn't block 204 | // holding locks... 205 | } 206 | } 207 | }() 208 | 209 | rf := Make(ends, i, cfg.saved[i], applyCh) 210 | 211 | cfg.mu.Lock() 212 | cfg.rafts[i] = rf 213 | cfg.mu.Unlock() 214 | 215 | svc := labrpc.MakeService(rf) 216 | srv := labrpc.MakeServer() 217 | srv.AddService(svc) 218 | cfg.net.AddServer(i, srv) 219 | } 220 | 221 | func (cfg *config) checkTimeout() { 222 | // enforce a two minute real-time limit on each test 223 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 224 | cfg.t.Fatal("test took longer than 120 seconds") 225 | } 226 | } 227 | 228 | func (cfg *config) cleanup() { 229 | for i := 0; i < len(cfg.rafts); i++ { 230 | if cfg.rafts[i] != nil { 231 | cfg.rafts[i].Kill() 232 | } 233 | } 234 | cfg.net.Cleanup() 235 | cfg.checkTimeout() 236 | } 237 | 238 | // attach server i to the net. 
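// It enables both the outgoing ClientEnds from server i and the incoming
// ClientEnds from every peer that is currently connected.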
239 | func (cfg *config) connect(i int) { 240 | // fmt.Printf("connect(%d)\n", i) 241 | 242 | cfg.connected[i] = true 243 | 244 | // outgoing ClientEnds 245 | for j := 0; j < cfg.n; j++ { 246 | if cfg.connected[j] { 247 | endname := cfg.endnames[i][j] 248 | cfg.net.Enable(endname, true) 249 | } 250 | } 251 | 252 | // incoming ClientEnds 253 | for j := 0; j < cfg.n; j++ { 254 | if cfg.connected[j] { 255 | endname := cfg.endnames[j][i] 256 | cfg.net.Enable(endname, true) 257 | } 258 | } 259 | } 260 | 261 | // detach server i from the net. 262 | func (cfg *config) disconnect(i int) { 263 | // fmt.Printf("disconnect(%d)\n", i) 264 | 265 | cfg.connected[i] = false 266 | 267 | // outgoing ClientEnds 268 | for j := 0; j < cfg.n; j++ { 269 | if cfg.endnames[i] != nil { 270 | endname := cfg.endnames[i][j] 271 | cfg.net.Enable(endname, false) 272 | } 273 | } 274 | 275 | // incoming ClientEnds 276 | for j := 0; j < cfg.n; j++ { 277 | if cfg.endnames[j] != nil { 278 | endname := cfg.endnames[j][i] 279 | cfg.net.Enable(endname, false) 280 | } 281 | } 282 | } 283 | 284 | func (cfg *config) rpcCount(server int) int { 285 | return cfg.net.GetCount(server) 286 | } 287 | 288 | func (cfg *config) rpcTotal() int { 289 | return cfg.net.GetTotalCount() 290 | } 291 | 292 | func (cfg *config) setunreliable(unRel bool) { 293 | cfg.net.Reliable(!unRel) 294 | } 295 | 296 | func (cfg *config) setlongreordering(longrel bool) { 297 | cfg.net.LongReordering(longrel) 298 | } 299 | 300 | // check that there's exactly one leader. 301 | // try a few times in case re-elections are needed. 302 | func (cfg *config) checkOneLeader() int { 303 | for iters := 0; iters < 10; iters++ { 304 | ms := 450 + (rand.Int63() % 100) 305 | time.Sleep(time.Duration(ms) * time.Millisecond) 306 | 307 | leaders := make(map[int][]int) 308 | for i := 0; i < cfg.n; i++ { 309 | if cfg.connected[i] { 310 | if term, leader := cfg.rafts[i].GetState(); leader { 311 | leaders[term] = append(leaders[term], i) 312 | } 313 | } 314 | } 315 | 316 | lastTermWithLeader := -1 317 | for term, leaders := range leaders { 318 | if len(leaders) > 1 { 319 | cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders)) 320 | } 321 | if term > lastTermWithLeader { 322 | lastTermWithLeader = term 323 | } 324 | } 325 | 326 | if len(leaders) != 0 { 327 | return leaders[lastTermWithLeader][0] 328 | } 329 | } 330 | cfg.t.Fatalf("expected one leader, got none") 331 | return -1 332 | } 333 | 334 | // check that everyone agrees on the term. 335 | func (cfg *config) checkTerms() int { 336 | term := -1 337 | for i := 0; i < cfg.n; i++ { 338 | if cfg.connected[i] { 339 | xterm, _ := cfg.rafts[i].GetState() 340 | if term == -1 { 341 | term = xterm 342 | } else if term != xterm { 343 | cfg.t.Fatalf("servers disagree on term") 344 | } 345 | } 346 | } 347 | return term 348 | } 349 | 350 | // check that there's no leader 351 | func (cfg *config) checkNoLeader() { 352 | for i := 0; i < cfg.n; i++ { 353 | if cfg.connected[i] { 354 | _, isLeader := cfg.rafts[i].GetState() 355 | if isLeader { 356 | cfg.t.Fatalf("expected no leader, but %v claims to be leader", i) 357 | } 358 | } 359 | } 360 | } 361 | 362 | // how many servers think a log entry is committed? 
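// nCommitted returns the number of servers whose recorded log (cfg.logs) has
// an entry at the given index, together with the command stored there; it
// fails the test if two servers committed different commands at that index.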
363 | func (cfg *config) nCommitted(index int) (int, interface{}) { 364 | count := 0 365 | cmd := -1 366 | for i := 0; i < len(cfg.rafts); i++ { 367 | if cfg.applyErr[i] != "" { 368 | cfg.t.Fatal(cfg.applyErr[i]) 369 | } 370 | 371 | cfg.mu.Lock() 372 | cmd1, ok := cfg.logs[i][index] 373 | cfg.mu.Unlock() 374 | 375 | if ok { 376 | if count > 0 && cmd != cmd1 { 377 | cfg.t.Fatalf("committed values do not match: index %v, %v, %v\n", 378 | index, cmd, cmd1) 379 | } 380 | count++ 381 | cmd = cmd1 382 | } 383 | } 384 | return count, cmd 385 | } 386 | 387 | // wait for at least n servers to commit. 388 | // but don't wait forever. 389 | func (cfg *config) wait(index int, n int, startTerm int) interface{} { 390 | to := 10 * time.Millisecond 391 | for iters := 0; iters < 30; iters++ { 392 | nd, _ := cfg.nCommitted(index) 393 | if nd >= n { 394 | break 395 | } 396 | time.Sleep(to) 397 | if to < time.Second { 398 | to *= 2 399 | } 400 | if startTerm > -1 { 401 | for _, r := range cfg.rafts { 402 | if t, _ := r.GetState(); t > startTerm { 403 | // someone has moved on 404 | // can no longer guarantee that we'll "win" 405 | return -1 406 | } 407 | } 408 | } 409 | } 410 | nd, cmd := cfg.nCommitted(index) 411 | if nd < n { 412 | cfg.t.Fatalf("only %d decided for index %d; wanted %d\n", 413 | nd, index, n) 414 | } 415 | return cmd 416 | } 417 | 418 | // do a complete agreement. 419 | // it might choose the wrong leader initially, 420 | // and have to re-submit after giving up. 421 | // entirely gives up after about 10 seconds. 422 | // indirectly checks that the servers agree on the 423 | // same value, since nCommitted() checks this, 424 | // as do the threads that read from applyCh. 425 | // returns index. 426 | // if retry==true, may submit the command multiple 427 | // times, in case a leader fails just after Start(). 428 | // if retry==false, calls Start() only once, in order 429 | // to simplify the early Lab 2B tests. 430 | func (cfg *config) one(cmd int, expectedServers int, retry bool) int { 431 | t0 := time.Now() 432 | starts := 0 433 | for time.Since(t0).Seconds() < 10 { 434 | // try all the servers, maybe one is the leader. 435 | index := -1 436 | for si := 0; si < cfg.n; si++ { 437 | starts = (starts + 1) % cfg.n 438 | var rf *Raft 439 | cfg.mu.Lock() 440 | if cfg.connected[starts] { 441 | rf = cfg.rafts[starts] 442 | } 443 | cfg.mu.Unlock() 444 | if rf != nil { 445 | index1, _, ok := rf.Start(cmd) 446 | if ok { 447 | index = index1 448 | cfg.t.Logf(" ## %v 的 logIndex: %d, %s %s ", cmd, index, rf, rf.details()) 449 | break 450 | } 451 | } 452 | } 453 | 454 | if index != -1 { 455 | // somebody claimed to be the leader and to have 456 | // submitted our command; wait a while for agreement. 457 | t1 := time.Now() 458 | for time.Since(t1).Seconds() < 2 { 459 | nd, cmd1 := cfg.nCommitted(index) 460 | cfg.t.Logf(" ## %d/%d raft 的 logIndex:%d 的值为 %v", nd, len(cfg.logs), index, cmd1) 461 | if nd > 0 && nd >= expectedServers { 462 | // committed 463 | if cmd2, ok := cmd1.(int); ok && cmd2 == cmd { 464 | // and it was the command we submitted. 465 | return index 466 | } 467 | } 468 | time.Sleep(20 * time.Millisecond) 469 | } 470 | if retry == false { 471 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 472 | } 473 | } else { 474 | time.Sleep(50 * time.Millisecond) 475 | } 476 | } 477 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 478 | return -1 479 | } 480 | 481 | // start a Test. 482 | // print the Test message. 483 | // e.g. 
cfg.begin("Test (2B): RPC counts aren't too high") 484 | func (cfg *config) begin(description string) { 485 | fmt.Printf("%s ...\n", description) 486 | cfg.beginTime = time.Now() 487 | cfg.rpcs0 = cfg.rpcTotal() 488 | cfg.cmds0 = 0 489 | cfg.maxIndex0 = cfg.maxIndex 490 | } 491 | 492 | // end a Test -- the fact that we got here means there 493 | // was no failure. 494 | // print the Passed message, 495 | // and some performance numbers. 496 | func (cfg *config) end() { 497 | cfg.checkTimeout() 498 | if cfg.t.Failed() == false { 499 | cfg.mu.Lock() 500 | t := time.Since(cfg.beginTime).Seconds() // real time 501 | nPeers := cfg.n // number of Raft peers 502 | nRPC := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 503 | nCMDs := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported 504 | cfg.mu.Unlock() 505 | 506 | fmt.Printf(" ... Passed --") 507 | fmt.Printf(" %4.1f %d %4d %4d\n", t, nPeers, nRPC, nCMDs) 508 | } 509 | } 510 | -------------------------------------------------------------------------------- /Raft/code/labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | // LabEncoder is 23 | type LabEncoder struct { 24 | gob *gob.Encoder 25 | } 26 | 27 | // NewEncoder is 28 | func NewEncoder(w io.Writer) *LabEncoder { 29 | enc := &LabEncoder{} 30 | enc.gob = gob.NewEncoder(w) 31 | return enc 32 | } 33 | 34 | // Encode is 35 | func (enc *LabEncoder) Encode(e interface{}) error { 36 | checkValue(e) 37 | return enc.gob.Encode(e) 38 | } 39 | 40 | // EncodeValue is 41 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 42 | checkValue(value.Interface()) 43 | return enc.gob.EncodeValue(value) 44 | } 45 | 46 | // LabDecoder is 47 | type LabDecoder struct { 48 | gob *gob.Decoder 49 | } 50 | 51 | // NewDecoder is 52 | func NewDecoder(r io.Reader) *LabDecoder { 53 | dec := &LabDecoder{} 54 | dec.gob = gob.NewDecoder(r) 55 | return dec 56 | } 57 | 58 | // Decode is 59 | func (dec *LabDecoder) Decode(e interface{}) error { 60 | checkValue(e) 61 | checkDefault(e) 62 | return dec.gob.Decode(e) 63 | } 64 | 65 | // Register is 66 | func Register(value interface{}) { 67 | checkValue(value) 68 | gob.Register(value) 69 | } 70 | 71 | // RegisterName is 72 | func RegisterName(name string, value interface{}) { 73 | checkValue(value) 74 | gob.RegisterName(name, value) 75 | } 76 | 77 | func checkValue(value interface{}) { 78 | checkType(reflect.TypeOf(value)) 79 | } 80 | 81 | func checkType(t reflect.Type) { 82 | k := t.Kind() 83 | 84 | mu.Lock() 85 | // only complain once, and avoid recursion. 
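// Illustrative note (added, not from the original): the `checked` map below
// memoizes types, so each problematic type is reported at most once per
// process. For example, encoding a hypothetical struct such as
//
//   type BadArgs struct {
//       Term int
//       term int // lower-case: encoding/gob silently drops it, hence the warning
//   }
//
// prints the "lower-case field" message a single time, no matter how many
// BadArgs values are encoded.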
86 | if checked == nil { 87 | checked = map[reflect.Type]bool{} 88 | } 89 | if checked[t] { 90 | mu.Unlock() 91 | return 92 | } 93 | checked[t] = true 94 | mu.Unlock() 95 | 96 | switch k { 97 | case reflect.Struct: 98 | for i := 0; i < t.NumField(); i++ { 99 | f := t.Field(i) 100 | rune, _ := utf8.DecodeRuneInString(f.Name) 101 | if unicode.IsUpper(rune) == false { 102 | // ta da 103 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 104 | f.Name, t.Name()) 105 | mu.Lock() 106 | errorCount++ 107 | mu.Unlock() 108 | } 109 | checkType(f.Type) 110 | } 111 | return 112 | case reflect.Slice, reflect.Array, reflect.Ptr: 113 | checkType(t.Elem()) 114 | return 115 | case reflect.Map: 116 | checkType(t.Elem()) 117 | checkType(t.Key()) 118 | return 119 | default: 120 | return 121 | } 122 | } 123 | 124 | // 125 | // warn if the value contains non-default values, 126 | // as it would if one sent an RPC but the reply 127 | // struct was already modified. if the RPC reply 128 | // contains default values, GOB won't overwrite 129 | // the non-default value. 130 | // 131 | func checkDefault(value interface{}) { 132 | if value == nil { 133 | return 134 | } 135 | checkDefault1(reflect.ValueOf(value), 1, "") 136 | } 137 | 138 | func checkDefault1(value reflect.Value, depth int, name string) { 139 | if depth > 3 { 140 | return 141 | } 142 | 143 | t := value.Type() 144 | k := t.Kind() 145 | 146 | switch k { 147 | case reflect.Struct: 148 | for i := 0; i < t.NumField(); i++ { 149 | vv := value.Field(i) 150 | name1 := t.Field(i).Name 151 | if name != "" { 152 | name1 = name + "." + name1 153 | } 154 | checkDefault1(vv, depth+1, name1) 155 | } 156 | return 157 | case reflect.Ptr: 158 | if value.IsNil() { 159 | return 160 | } 161 | checkDefault1(value.Elem(), depth+1, name) 162 | return 163 | case reflect.Bool, 164 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 165 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 166 | reflect.Uintptr, reflect.Float32, reflect.Float64, 167 | reflect.String: 168 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 169 | mu.Lock() 170 | if errorCount < 1 { 171 | what := name 172 | if what == "" { 173 | what = t.Name() 174 | } 175 | // this warning typically arises if code re-uses the same RPC reply 176 | // variable for multiple RPC calls, or if code restores persisted 177 | // state into variable that already have non-default values. 178 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 179 | what) 180 | } 181 | errorCount++ 182 | mu.Unlock() 183 | } 184 | return 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /Raft/code/labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 
26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 
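// Note added for clarity: encoding/gob does not transmit zero-valued fields, so
// the decode below leaves reply.X at 99 rather than overwriting it with dd1's
// zero; the checkDefault pass in labgob.go exists to warn about exactly this
// reply-reuse pattern.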
163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /Raft/code/labrpc/labrpc.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | // 4 | // channel-based RPC, for 824 labs. 5 | // 6 | // simulates a network that can lose requests, lose replies, 7 | // delay messages, and entirely disconnect particular hosts. 8 | // 9 | // we will use the original labrpc.go to test your code for grading. 10 | // so, while you can modify this code to help you debug, please 11 | // test against the original before submitting. 12 | // 13 | // adapted from Go net/rpc/server.go. 14 | // 15 | // sends labgob-encoded values to ensure that RPCs 16 | // don't include references to program objects. 17 | // 18 | // net := MakeNetwork() -- holds network, clients, servers. 19 | // end := net.MakeEnd(endname) -- create a client end-point, to talk to one server. 20 | // net.AddServer(servername, server) -- adds a named server to network. 21 | // net.DeleteServer(servername) -- eliminate the named server. 22 | // net.Connect(endname, servername) -- connect a client to a server. 23 | // net.Enable(endname, enabled) -- enable/disable a client. 24 | // net.Reliable(bool) -- false means drop/delay messages 25 | // 26 | // end.Call("Raft.AppendEntries", &args, &reply) -- send an RPC, wait for reply. 27 | // the "Raft" is the name of the server struct to be called. 28 | // the "AppendEntries" is the name of the method to be called. 29 | // Call() returns true to indicate that the server executed the request 30 | // and the reply is valid. 31 | // Call() returns false if the network lost the request or reply 32 | // or the server is down. 33 | // It is OK to have multiple Call()s in progress at the same time on the 34 | // same ClientEnd. 35 | // Concurrent calls to Call() may be delivered to the server out of order, 36 | // since the network may re-order messages. 37 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 38 | // handler function on the server side does not return. 39 | // the server RPC handler function must declare its args and reply arguments 40 | // as pointers, so that their types exactly match the types of the arguments 41 | // to Call(). 42 | // 43 | // srv := MakeServer() 44 | // srv.AddService(svc) -- a server can have multiple services, e.g. Raft and k/v 45 | // pass srv to net.AddServer() 46 | // 47 | // svc := MakeService(receiverObject) -- obj's methods will handle RPCs 48 | // much like Go's rpcs.Register() 49 | // pass svc to srv.AddService() 50 | // 51 | 52 | import "github.com/aQuaYi/Distributed-Algorithms/Raft/code/labgob" 53 | import "bytes" 54 | import "reflect" 55 | import "sync" 56 | import "log" 57 | import "strings" 58 | import "math/rand" 59 | import "time" 60 | import "sync/atomic" 61 | 62 | type reqMsg struct { 63 | endname interface{} // name of sending ClientEnd 64 | svcMeth string // e.g. 
"Raft.AppendEntries" 65 | argsType reflect.Type 66 | args []byte 67 | replyCh chan replyMsg 68 | } 69 | 70 | type replyMsg struct { 71 | ok bool 72 | reply []byte 73 | } 74 | 75 | // ClientEnd is 76 | type ClientEnd struct { 77 | endname interface{} // this end-point's name 78 | ch chan reqMsg // copy of Network.endCh 79 | done chan struct{} // closed when Network is cleaned up 80 | } 81 | 82 | // Call is 83 | // send an RPC, wait for the reply. 84 | // the return value indicates success; false means that 85 | // no reply was received from the server. 86 | func (e *ClientEnd) Call(svcMeth string, args interface{}, reply interface{}) bool { 87 | req := reqMsg{} 88 | req.endname = e.endname 89 | req.svcMeth = svcMeth 90 | req.argsType = reflect.TypeOf(args) 91 | req.replyCh = make(chan replyMsg) 92 | 93 | qb := new(bytes.Buffer) 94 | qe := labgob.NewEncoder(qb) 95 | qe.Encode(args) 96 | req.args = qb.Bytes() 97 | 98 | select { 99 | case e.ch <- req: 100 | // ok 101 | case <-e.done: 102 | return false 103 | } 104 | 105 | rep := <-req.replyCh 106 | if rep.ok { 107 | rb := bytes.NewBuffer(rep.reply) 108 | rd := labgob.NewDecoder(rb) 109 | if err := rd.Decode(reply); err != nil { 110 | log.Fatalf("ClientEnd.Call(): decode reply: %v\n", err) 111 | } 112 | return true 113 | } else { 114 | return false 115 | } 116 | } 117 | 118 | type Network struct { 119 | mu sync.Mutex 120 | reliable bool 121 | longDelays bool // pause a long time on send on disabled connection 122 | longReordering bool // sometimes delay replies a long time 123 | ends map[interface{}]*ClientEnd // ends, by name 124 | enabled map[interface{}]bool // by end name 125 | servers map[interface{}]*Server // servers, by name 126 | connections map[interface{}]interface{} // endname -> servername 127 | endCh chan reqMsg 128 | done chan struct{} // closed when Network is cleaned up 129 | count int32 // total RPC count, for statistics 130 | } 131 | 132 | func MakeNetwork() *Network { 133 | rn := &Network{} 134 | rn.reliable = true 135 | rn.ends = map[interface{}]*ClientEnd{} 136 | rn.enabled = map[interface{}]bool{} 137 | rn.servers = map[interface{}]*Server{} 138 | rn.connections = map[interface{}](interface{}){} 139 | rn.endCh = make(chan reqMsg) 140 | rn.done = make(chan struct{}) 141 | 142 | // single goroutine to handle all ClientEnd.Call()s 143 | go func() { 144 | for { 145 | select { 146 | case xReq := <-rn.endCh: 147 | atomic.AddInt32(&rn.count, 1) 148 | go rn.ProcessReq(xReq) 149 | case <-rn.done: 150 | return 151 | } 152 | } 153 | }() 154 | 155 | return rn 156 | } 157 | 158 | func (rn *Network) Cleanup() { 159 | close(rn.done) 160 | } 161 | 162 | func (rn *Network) Reliable(yes bool) { 163 | rn.mu.Lock() 164 | defer rn.mu.Unlock() 165 | 166 | rn.reliable = yes 167 | } 168 | 169 | func (rn *Network) LongReordering(yes bool) { 170 | rn.mu.Lock() 171 | defer rn.mu.Unlock() 172 | 173 | rn.longReordering = yes 174 | } 175 | 176 | func (rn *Network) LongDelays(yes bool) { 177 | rn.mu.Lock() 178 | defer rn.mu.Unlock() 179 | 180 | rn.longDelays = yes 181 | } 182 | 183 | func (rn *Network) ReadEndnameInfo(endname interface{}) (enabled bool, 184 | servername interface{}, server *Server, reliable bool, longreordering bool, 185 | ) { 186 | rn.mu.Lock() 187 | defer rn.mu.Unlock() 188 | 189 | enabled = rn.enabled[endname] 190 | servername = rn.connections[endname] 191 | if servername != nil { 192 | server = rn.servers[servername] 193 | } 194 | reliable = rn.reliable 195 | longreordering = rn.longReordering 196 | return 197 | } 198 | 199 | func 
(rn *Network) IsServerDead(endname interface{}, servername interface{}, server *Server) bool { 200 | rn.mu.Lock() 201 | defer rn.mu.Unlock() 202 | 203 | if rn.enabled[endname] == false || rn.servers[servername] != server { 204 | return true 205 | } 206 | return false 207 | } 208 | 209 | func (rn *Network) ProcessReq(req reqMsg) { 210 | enabled, servername, server, reliable, longreordering := rn.ReadEndnameInfo(req.endname) 211 | 212 | if enabled && servername != nil && server != nil { 213 | if reliable == false { 214 | // short delay 215 | ms := (rand.Int() % 27) 216 | time.Sleep(time.Duration(ms) * time.Millisecond) 217 | } 218 | 219 | if reliable == false && (rand.Int()%1000) < 100 { 220 | // drop the request, return as if timeout 221 | req.replyCh <- replyMsg{false, nil} 222 | return 223 | } 224 | 225 | // execute the request (call the RPC handler). 226 | // in a separate thread so that we can periodically check 227 | // if the server has been killed and the RPC should get a 228 | // failure reply. 229 | ech := make(chan replyMsg) 230 | go func() { 231 | r := server.dispatch(req) 232 | ech <- r 233 | }() 234 | 235 | // wait for handler to return, 236 | // but stop waiting if DeleteServer() has been called, 237 | // and return an error. 238 | var reply replyMsg 239 | replyOK := false 240 | serverDead := false 241 | for replyOK == false && serverDead == false { 242 | select { 243 | case reply = <-ech: 244 | replyOK = true 245 | case <-time.After(100 * time.Millisecond): 246 | serverDead = rn.IsServerDead(req.endname, servername, server) 247 | if serverDead { 248 | go func() { 249 | <-ech // drain channel to let the goroutine created earlier terminate 250 | }() 251 | } 252 | } 253 | } 254 | 255 | // do not reply if DeleteServer() has been called, i.e. 256 | // the server has been killed. this is needed to avoid 257 | // situation in which a client gets a positive reply 258 | // to an Append, but the server persisted the update 259 | // into the old Persister. config.go is careful to call 260 | // DeleteServer() before superseding the Persister. 261 | serverDead = rn.IsServerDead(req.endname, servername, server) 262 | 263 | if replyOK == false || serverDead == true { 264 | // server was killed while we were waiting; return error. 265 | req.replyCh <- replyMsg{false, nil} 266 | } else if reliable == false && (rand.Int()%1000) < 100 { 267 | // drop the reply, return as if timeout 268 | req.replyCh <- replyMsg{false, nil} 269 | } else if longreordering == true && rand.Intn(900) < 600 { 270 | // delay the response for a while 271 | ms := 200 + rand.Intn(1+rand.Intn(2000)) 272 | // Russ points out that this timer arrangement will decrease 273 | // the number of goroutines, so that the race 274 | // detector is less likely to get upset. 275 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 276 | req.replyCh <- reply 277 | }) 278 | } else { 279 | req.replyCh <- reply 280 | } 281 | } else { 282 | // simulate no reply and eventual timeout. 283 | ms := 0 284 | if rn.longDelays { 285 | // let Raft tests check that leader doesn't send 286 | // RPCs synchronously. 287 | ms = (rand.Int() % 7000) 288 | } else { 289 | // many kv tests require the client to try each 290 | // server in fairly rapid succession. 291 | ms = (rand.Int() % 100) 292 | } 293 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 294 | req.replyCh <- replyMsg{false, nil} 295 | }) 296 | } 297 | 298 | } 299 | 300 | // create a client end-point. 301 | // start the thread that listens and delivers. 
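// Illustrative sketch of the whole client/server wiring (it mirrors the usage
// outline at the top of this file; the names "end1" and "srv1" are arbitrary,
// and JunkServer comes from labrpc's test_test.go):
//
//   net := MakeNetwork()
//   end := net.MakeEnd("end1")
//   srv := MakeServer()
//   srv.AddService(MakeService(&JunkServer{}))
//   net.AddServer("srv1", srv)
//   net.Connect("end1", "srv1")
//   net.Enable("end1", true)
//   reply := ""
//   ok := end.Call("JunkServer.Handler2", 111, &reply) // false: request/reply lost, or server down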
302 | func (rn *Network) MakeEnd(endname interface{}) *ClientEnd { 303 | rn.mu.Lock() 304 | defer rn.mu.Unlock() 305 | 306 | if _, ok := rn.ends[endname]; ok { 307 | log.Fatalf("MakeEnd: %v already exists\n", endname) 308 | } 309 | 310 | e := &ClientEnd{} 311 | e.endname = endname 312 | e.ch = rn.endCh 313 | e.done = rn.done 314 | rn.ends[endname] = e 315 | rn.enabled[endname] = false 316 | rn.connections[endname] = nil 317 | 318 | return e 319 | } 320 | 321 | func (rn *Network) AddServer(servername interface{}, rs *Server) { 322 | rn.mu.Lock() 323 | defer rn.mu.Unlock() 324 | 325 | rn.servers[servername] = rs 326 | } 327 | 328 | func (rn *Network) DeleteServer(servername interface{}) { 329 | rn.mu.Lock() 330 | defer rn.mu.Unlock() 331 | 332 | rn.servers[servername] = nil 333 | } 334 | 335 | // connect a ClientEnd to a server. 336 | // a ClientEnd can only be connected once in its lifetime. 337 | func (rn *Network) Connect(endname interface{}, servername interface{}) { 338 | rn.mu.Lock() 339 | defer rn.mu.Unlock() 340 | 341 | rn.connections[endname] = servername 342 | } 343 | 344 | // enable/disable a ClientEnd. 345 | func (rn *Network) Enable(endname interface{}, enabled bool) { 346 | rn.mu.Lock() 347 | defer rn.mu.Unlock() 348 | 349 | rn.enabled[endname] = enabled 350 | } 351 | 352 | // get a server's count of incoming RPCs. 353 | func (rn *Network) GetCount(servername interface{}) int { 354 | rn.mu.Lock() 355 | defer rn.mu.Unlock() 356 | 357 | svr := rn.servers[servername] 358 | return svr.GetCount() 359 | } 360 | 361 | func (rn *Network) GetTotalCount() int { 362 | x := atomic.LoadInt32(&rn.count) 363 | return int(x) 364 | } 365 | 366 | // 367 | // a server is a collection of services, all sharing 368 | // the same rpc dispatcher. so that e.g. both a Raft 369 | // and a k/v server can listen to the same rpc endpoint. 370 | // 371 | type Server struct { 372 | mu sync.Mutex 373 | services map[string]*Service 374 | count int // incoming RPCs 375 | } 376 | 377 | func MakeServer() *Server { 378 | rs := &Server{} 379 | rs.services = map[string]*Service{} 380 | return rs 381 | } 382 | 383 | func (rs *Server) AddService(svc *Service) { 384 | rs.mu.Lock() 385 | defer rs.mu.Unlock() 386 | rs.services[svc.name] = svc 387 | } 388 | 389 | func (rs *Server) dispatch(req reqMsg) replyMsg { 390 | rs.mu.Lock() 391 | 392 | rs.count += 1 393 | 394 | // split Raft.AppendEntries into service and method 395 | dot := strings.LastIndex(req.svcMeth, ".") 396 | serviceName := req.svcMeth[:dot] 397 | methodName := req.svcMeth[dot+1:] 398 | 399 | service, ok := rs.services[serviceName] 400 | 401 | rs.mu.Unlock() 402 | 403 | if ok { 404 | return service.dispatch(methodName, req) 405 | } else { 406 | choices := []string{} 407 | for k, _ := range rs.services { 408 | choices = append(choices, k) 409 | } 410 | log.Fatalf("labrpc.Server.dispatch(): unknown service %v in %v.%v; expecting one of %v\n", 411 | serviceName, serviceName, methodName, choices) 412 | return replyMsg{false, nil} 413 | } 414 | } 415 | 416 | // GetCount is 417 | func (rs *Server) GetCount() int { 418 | rs.mu.Lock() 419 | defer rs.mu.Unlock() 420 | return rs.count 421 | } 422 | 423 | // Service is 424 | // an object with methods that can be called via RPC. 425 | // a single server may have more than one Service. 
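// Note added for clarity: MakeService below uses reflection to register only
// those methods of the receiver that look like RPC handlers, i.e.
//
//   func (t *T) MethodName(args ArgsType, reply *ReplyType)
//
// exported, exactly two parameters after the receiver, a pointer reply, and no
// return values; methods with any other shape are silently skipped.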
426 | type Service struct { 427 | name string 428 | rcvr reflect.Value 429 | typ reflect.Type 430 | methods map[string]reflect.Method 431 | } 432 | 433 | // MakeService is 434 | func MakeService(rcvr interface{}) *Service { 435 | svc := &Service{} 436 | svc.typ = reflect.TypeOf(rcvr) 437 | svc.rcvr = reflect.ValueOf(rcvr) 438 | svc.name = reflect.Indirect(svc.rcvr).Type().Name() 439 | svc.methods = map[string]reflect.Method{} 440 | 441 | for m := 0; m < svc.typ.NumMethod(); m++ { 442 | method := svc.typ.Method(m) 443 | mType := method.Type 444 | mName := method.Name 445 | 446 | //fmt.Printf("%v pp %v ni %v 1k %v 2k %v no %v\n", 447 | // mName, method.PkgPath, mType.NumIn(), mType.In(1).Kind(), mType.In(2).Kind(), mType.NumOut()) 448 | 449 | if method.PkgPath != "" || // capitalized? 450 | mType.NumIn() != 3 || 451 | //mType.In(1).Kind() != reflect.Ptr || 452 | mType.In(2).Kind() != reflect.Ptr || 453 | mType.NumOut() != 0 { 454 | // the method is not suitable for a handler 455 | //fmt.Printf("bad method: %v\n", mName) 456 | } else { 457 | // the method looks like a handler 458 | svc.methods[mName] = method 459 | } 460 | } 461 | 462 | return svc 463 | } 464 | 465 | func (svc *Service) dispatch(methname string, req reqMsg) replyMsg { 466 | if method, ok := svc.methods[methname]; ok { 467 | // prepare space into which to read the argument. 468 | // the Value's type will be a pointer to req.argsType. 469 | args := reflect.New(req.argsType) 470 | 471 | // decode the argument. 472 | ab := bytes.NewBuffer(req.args) 473 | ad := labgob.NewDecoder(ab) 474 | ad.Decode(args.Interface()) 475 | 476 | // allocate space for the reply. 477 | replyType := method.Type.In(2) 478 | replyType = replyType.Elem() 479 | replyv := reflect.New(replyType) 480 | 481 | // call the method. 482 | function := method.Func 483 | function.Call([]reflect.Value{svc.rcvr, args.Elem(), replyv}) 484 | 485 | // encode the reply. 
486 | rb := new(bytes.Buffer) 487 | re := labgob.NewEncoder(rb) 488 | re.EncodeValue(replyv) 489 | 490 | return replyMsg{true, rb.Bytes()} 491 | } else { 492 | choices := []string{} 493 | for k := range svc.methods { 494 | choices = append(choices, k) 495 | } 496 | log.Fatalf("labrpc.Service.dispatch(): unknown method %v in %v; expecting one of %v\n", 497 | methname, req.svcMeth, choices) 498 | return replyMsg{false, nil} 499 | } 500 | } 501 | -------------------------------------------------------------------------------- /Raft/code/labrpc/test_test.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | import "testing" 4 | import "strconv" 5 | import "sync" 6 | import "runtime" 7 | import "time" 8 | import "fmt" 9 | 10 | type JunkArgs struct { 11 | X int 12 | } 13 | type JunkReply struct { 14 | X string 15 | } 16 | 17 | type JunkServer struct { 18 | mu sync.Mutex 19 | log1 []string 20 | log2 []int 21 | } 22 | 23 | func (js *JunkServer) Handler1(args string, reply *int) { 24 | js.mu.Lock() 25 | defer js.mu.Unlock() 26 | js.log1 = append(js.log1, args) 27 | *reply, _ = strconv.Atoi(args) 28 | } 29 | 30 | func (js *JunkServer) Handler2(args int, reply *string) { 31 | js.mu.Lock() 32 | defer js.mu.Unlock() 33 | js.log2 = append(js.log2, args) 34 | *reply = "handler2-" + strconv.Itoa(args) 35 | } 36 | 37 | func (js *JunkServer) Handler3(args int, reply *int) { 38 | js.mu.Lock() 39 | defer js.mu.Unlock() 40 | time.Sleep(20 * time.Second) 41 | *reply = -args 42 | } 43 | 44 | // args is a pointer 45 | func (js *JunkServer) Handler4(args *JunkArgs, reply *JunkReply) { 46 | reply.X = "pointer" 47 | } 48 | 49 | // args is a not pointer 50 | func (js *JunkServer) Handler5(args JunkArgs, reply *JunkReply) { 51 | reply.X = "no pointer" 52 | } 53 | 54 | func TestBasic(t *testing.T) { 55 | runtime.GOMAXPROCS(4) 56 | 57 | rn := MakeNetwork() 58 | defer rn.Cleanup() 59 | 60 | e := rn.MakeEnd("end1-99") 61 | 62 | js := &JunkServer{} 63 | svc := MakeService(js) 64 | 65 | rs := MakeServer() 66 | rs.AddService(svc) 67 | rn.AddServer("server99", rs) 68 | 69 | rn.Connect("end1-99", "server99") 70 | rn.Enable("end1-99", true) 71 | 72 | { 73 | reply := "" 74 | e.Call("JunkServer.Handler2", 111, &reply) 75 | if reply != "handler2-111" { 76 | t.Fatalf("wrong reply from Handler2") 77 | } 78 | } 79 | 80 | { 81 | reply := 0 82 | e.Call("JunkServer.Handler1", "9099", &reply) 83 | if reply != 9099 { 84 | t.Fatalf("wrong reply from Handler1") 85 | } 86 | } 87 | } 88 | 89 | func TestTypes(t *testing.T) { 90 | runtime.GOMAXPROCS(4) 91 | 92 | rn := MakeNetwork() 93 | defer rn.Cleanup() 94 | 95 | e := rn.MakeEnd("end1-99") 96 | 97 | js := &JunkServer{} 98 | svc := MakeService(js) 99 | 100 | rs := MakeServer() 101 | rs.AddService(svc) 102 | rn.AddServer("server99", rs) 103 | 104 | rn.Connect("end1-99", "server99") 105 | rn.Enable("end1-99", true) 106 | 107 | { 108 | var args JunkArgs 109 | var reply JunkReply 110 | // args must match type (pointer or not) of handler. 111 | e.Call("JunkServer.Handler4", &args, &reply) 112 | if reply.X != "pointer" { 113 | t.Fatalf("wrong reply from Handler4") 114 | } 115 | } 116 | 117 | { 118 | var args JunkArgs 119 | var reply JunkReply 120 | // args must match type (pointer or not) of handler. 121 | e.Call("JunkServer.Handler5", args, &reply) 122 | if reply.X != "no pointer" { 123 | t.Fatalf("wrong reply from Handler5") 124 | } 125 | } 126 | } 127 | 128 | // 129 | // does net.Enable(endname, false) really disconnect a client? 
130 | // 131 | func TestDisconnect(t *testing.T) { 132 | runtime.GOMAXPROCS(4) 133 | 134 | rn := MakeNetwork() 135 | defer rn.Cleanup() 136 | 137 | e := rn.MakeEnd("end1-99") 138 | 139 | js := &JunkServer{} 140 | svc := MakeService(js) 141 | 142 | rs := MakeServer() 143 | rs.AddService(svc) 144 | rn.AddServer("server99", rs) 145 | 146 | rn.Connect("end1-99", "server99") 147 | 148 | { 149 | reply := "" 150 | e.Call("JunkServer.Handler2", 111, &reply) 151 | if reply != "" { 152 | t.Fatalf("unexpected reply from Handler2") 153 | } 154 | } 155 | 156 | rn.Enable("end1-99", true) 157 | 158 | { 159 | reply := 0 160 | e.Call("JunkServer.Handler1", "9099", &reply) 161 | if reply != 9099 { 162 | t.Fatalf("wrong reply from Handler1") 163 | } 164 | } 165 | } 166 | 167 | // 168 | // test net.GetCount() 169 | // 170 | func TestCounts(t *testing.T) { 171 | runtime.GOMAXPROCS(4) 172 | 173 | rn := MakeNetwork() 174 | defer rn.Cleanup() 175 | 176 | e := rn.MakeEnd("end1-99") 177 | 178 | js := &JunkServer{} 179 | svc := MakeService(js) 180 | 181 | rs := MakeServer() 182 | rs.AddService(svc) 183 | rn.AddServer(99, rs) 184 | 185 | rn.Connect("end1-99", 99) 186 | rn.Enable("end1-99", true) 187 | 188 | for i := 0; i < 17; i++ { 189 | reply := "" 190 | e.Call("JunkServer.Handler2", i, &reply) 191 | wanted := "handler2-" + strconv.Itoa(i) 192 | if reply != wanted { 193 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 194 | } 195 | } 196 | 197 | n := rn.GetCount(99) 198 | if n != 17 { 199 | t.Fatalf("wrong GetCount() %v, expected 17\n", n) 200 | } 201 | } 202 | 203 | // 204 | // test RPCs from concurrent ClientEnds 205 | // 206 | func TestConcurrentMany(t *testing.T) { 207 | runtime.GOMAXPROCS(4) 208 | 209 | rn := MakeNetwork() 210 | defer rn.Cleanup() 211 | 212 | js := &JunkServer{} 213 | svc := MakeService(js) 214 | 215 | rs := MakeServer() 216 | rs.AddService(svc) 217 | rn.AddServer(1000, rs) 218 | 219 | ch := make(chan int) 220 | 221 | nClients := 20 222 | nRPCs := 10 223 | for ii := 0; ii < nClients; ii++ { 224 | go func(i int) { 225 | n := 0 226 | defer func() { ch <- n }() 227 | 228 | e := rn.MakeEnd(i) 229 | rn.Connect(i, 1000) 230 | rn.Enable(i, true) 231 | 232 | for j := 0; j < nRPCs; j++ { 233 | arg := i*100 + j 234 | reply := "" 235 | e.Call("JunkServer.Handler2", arg, &reply) 236 | wanted := "handler2-" + strconv.Itoa(arg) 237 | if reply != wanted { 238 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 239 | } 240 | n++ 241 | } 242 | }(ii) 243 | } 244 | 245 | total := 0 246 | for ii := 0; ii < nClients; ii++ { 247 | x := <-ch 248 | total += x 249 | } 250 | 251 | if total != nClients*nRPCs { 252 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nClients*nRPCs) 253 | } 254 | 255 | n := rn.GetCount(1000) 256 | if n != total { 257 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 258 | } 259 | } 260 | 261 | // 262 | // test unreliable 263 | // 264 | func TestUnreliable(t *testing.T) { 265 | runtime.GOMAXPROCS(4) 266 | 267 | rn := MakeNetwork() 268 | defer rn.Cleanup() 269 | rn.Reliable(false) 270 | 271 | js := &JunkServer{} 272 | svc := MakeService(js) 273 | 274 | rs := MakeServer() 275 | rs.AddService(svc) 276 | rn.AddServer(1000, rs) 277 | 278 | ch := make(chan int) 279 | 280 | nClients := 300 281 | for ii := 0; ii < nClients; ii++ { 282 | go func(i int) { 283 | n := 0 284 | defer func() { ch <- n }() 285 | 286 | e := rn.MakeEnd(i) 287 | rn.Connect(i, 1000) 288 | rn.Enable(i, true) 289 | 290 | arg := i * 100 291 | reply := "" 
292 | ok := e.Call("JunkServer.Handler2", arg, &reply) 293 | if ok { 294 | wanted := "handler2-" + strconv.Itoa(arg) 295 | if reply != wanted { 296 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 297 | } 298 | n++ 299 | } 300 | }(ii) 301 | } 302 | 303 | total := 0 304 | for ii := 0; ii < nClients; ii++ { 305 | x := <-ch 306 | total += x 307 | } 308 | 309 | if total == nClients || total == 0 { 310 | t.Fatalf("all RPCs succeeded despite unreliable") 311 | } 312 | } 313 | 314 | // 315 | // test concurrent RPCs from a single ClientEnd 316 | // 317 | func TestConcurrentOne(t *testing.T) { 318 | runtime.GOMAXPROCS(4) 319 | 320 | rn := MakeNetwork() 321 | defer rn.Cleanup() 322 | 323 | js := &JunkServer{} 324 | svc := MakeService(js) 325 | 326 | rs := MakeServer() 327 | rs.AddService(svc) 328 | rn.AddServer(1000, rs) 329 | 330 | e := rn.MakeEnd("c") 331 | rn.Connect("c", 1000) 332 | rn.Enable("c", true) 333 | 334 | ch := make(chan int) 335 | 336 | nRPCs := 20 337 | for ii := 0; ii < nRPCs; ii++ { 338 | go func(i int) { 339 | n := 0 340 | defer func() { ch <- n }() 341 | 342 | arg := 100 + i 343 | reply := "" 344 | e.Call("JunkServer.Handler2", arg, &reply) 345 | wanted := "handler2-" + strconv.Itoa(arg) 346 | if reply != wanted { 347 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 348 | } 349 | n++ 350 | }(ii) 351 | } 352 | 353 | total := 0 354 | for ii := 0; ii < nRPCs; ii++ { 355 | x := <-ch 356 | total += x 357 | } 358 | 359 | if total != nRPCs { 360 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nRPCs) 361 | } 362 | 363 | js.mu.Lock() 364 | defer js.mu.Unlock() 365 | if len(js.log2) != nRPCs { 366 | t.Fatalf("wrong number of RPCs delivered") 367 | } 368 | 369 | n := rn.GetCount(1000) 370 | if n != total { 371 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 372 | } 373 | } 374 | 375 | // 376 | // regression: an RPC that's delayed during Enabled=false 377 | // should not delay subsequent RPCs (e.g. after Enabled=true). 378 | // 379 | func TestRegression1(t *testing.T) { 380 | runtime.GOMAXPROCS(4) 381 | 382 | rn := MakeNetwork() 383 | defer rn.Cleanup() 384 | 385 | js := &JunkServer{} 386 | svc := MakeService(js) 387 | 388 | rs := MakeServer() 389 | rs.AddService(svc) 390 | rn.AddServer(1000, rs) 391 | 392 | e := rn.MakeEnd("c") 393 | rn.Connect("c", 1000) 394 | 395 | // start some RPCs while the ClientEnd is disabled. 396 | // they'll be delayed. 397 | rn.Enable("c", false) 398 | ch := make(chan bool) 399 | nRPCs := 20 400 | for ii := 0; ii < nRPCs; ii++ { 401 | go func(i int) { 402 | ok := false 403 | defer func() { ch <- ok }() 404 | 405 | arg := 100 + i 406 | reply := "" 407 | // this call ought to return false. 408 | e.Call("JunkServer.Handler2", arg, &reply) 409 | ok = true 410 | }(ii) 411 | } 412 | 413 | time.Sleep(100 * time.Millisecond) 414 | 415 | // now enable the ClientEnd and check that an RPC completes quickly. 
416 | t0 := time.Now() 417 | rn.Enable("c", true) 418 | { 419 | arg := 99 420 | reply := "" 421 | e.Call("JunkServer.Handler2", arg, &reply) 422 | wanted := "handler2-" + strconv.Itoa(arg) 423 | if reply != wanted { 424 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 425 | } 426 | } 427 | dur := time.Since(t0).Seconds() 428 | 429 | if dur > 0.03 { 430 | t.Fatalf("RPC took too long (%v) after Enable", dur) 431 | } 432 | 433 | for ii := 0; ii < nRPCs; ii++ { 434 | <-ch 435 | } 436 | 437 | js.mu.Lock() 438 | defer js.mu.Unlock() 439 | if len(js.log2) != 1 { 440 | t.Fatalf("wrong number (%v) of RPCs delivered, expected 1", len(js.log2)) 441 | } 442 | 443 | n := rn.GetCount(1000) 444 | if n != 1 { 445 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, 1) 446 | } 447 | } 448 | 449 | // 450 | // if an RPC is stuck in a server, and the server 451 | // is killed with DeleteServer(), does the RPC 452 | // get un-stuck? 453 | // 454 | func TestKilled(t *testing.T) { 455 | runtime.GOMAXPROCS(4) 456 | 457 | rn := MakeNetwork() 458 | defer rn.Cleanup() 459 | 460 | e := rn.MakeEnd("end1-99") 461 | 462 | js := &JunkServer{} 463 | svc := MakeService(js) 464 | 465 | rs := MakeServer() 466 | rs.AddService(svc) 467 | rn.AddServer("server99", rs) 468 | 469 | rn.Connect("end1-99", "server99") 470 | rn.Enable("end1-99", true) 471 | 472 | doneCh := make(chan bool) 473 | go func() { 474 | reply := 0 475 | ok := e.Call("JunkServer.Handler3", 99, &reply) 476 | doneCh <- ok 477 | }() 478 | 479 | time.Sleep(1000 * time.Millisecond) 480 | 481 | select { 482 | case <-doneCh: 483 | t.Fatalf("Handler3 should not have returned yet") 484 | case <-time.After(100 * time.Millisecond): 485 | } 486 | 487 | rn.DeleteServer("server99") 488 | 489 | select { 490 | case x := <-doneCh: 491 | if x != false { 492 | t.Fatalf("Handler3 returned successfully despite DeleteServer()") 493 | } 494 | case <-time.After(100 * time.Millisecond): 495 | t.Fatalf("Handler3 should return after DeleteServer()") 496 | } 497 | } 498 | 499 | func TestBenchmark(t *testing.T) { 500 | runtime.GOMAXPROCS(4) 501 | 502 | rn := MakeNetwork() 503 | defer rn.Cleanup() 504 | 505 | e := rn.MakeEnd("end1-99") 506 | 507 | js := &JunkServer{} 508 | svc := MakeService(js) 509 | 510 | rs := MakeServer() 511 | rs.AddService(svc) 512 | rn.AddServer("server99", rs) 513 | 514 | rn.Connect("end1-99", "server99") 515 | rn.Enable("end1-99", true) 516 | 517 | t0 := time.Now() 518 | n := 100000 519 | for iters := 0; iters < n; iters++ { 520 | reply := "" 521 | e.Call("JunkServer.Handler2", 111, &reply) 522 | if reply != "handler2-111" { 523 | t.Fatalf("wrong reply from Handler2") 524 | } 525 | } 526 | fmt.Printf("%v for %v\n", time.Since(t0), n) 527 | // march 2016, rtm laptop, 22 microseconds per RPC 528 | } 529 | -------------------------------------------------------------------------------- /Raft/code/persister.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft and kvraft to save persistent 5 | // Raft state (log &c) and k/v server snapshots. 6 | // 7 | // we will use the original persister.go to test your code for grading. 8 | // so, while you can modify this code to help you debug, please 9 | // test with the original before submitting. 
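// Illustrative sketch (an assumption about raft-persist.go, which is not shown
// here): Raft code typically combines this Persister with labgob roughly as
//
//   w := new(bytes.Buffer)
//   e := labgob.NewEncoder(w)
//   e.Encode(rf.currentTerm)
//   e.Encode(rf.votedFor)
//   e.Encode(rf.logs)
//   rf.persister.SaveRaftState(w.Bytes())
//
// and restores the same fields with labgob.NewDecoder inside readPersist().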
10 | // 11 | 12 | import "sync" 13 | 14 | // Persister is 15 | type Persister struct { 16 | mu sync.Mutex 17 | raftstate []byte 18 | snapshot []byte 19 | } 20 | 21 | // MakePersister is 22 | func MakePersister() *Persister { 23 | return &Persister{} 24 | } 25 | 26 | // Copy is 27 | func (ps *Persister) Copy() *Persister { 28 | ps.mu.Lock() 29 | defer ps.mu.Unlock() 30 | np := MakePersister() 31 | np.raftstate = ps.raftstate 32 | np.snapshot = ps.snapshot 33 | return np 34 | } 35 | 36 | // SaveRaftState is 37 | func (ps *Persister) SaveRaftState(state []byte) { 38 | ps.mu.Lock() 39 | defer ps.mu.Unlock() 40 | ps.raftstate = state 41 | } 42 | 43 | // ReadRaftState is 44 | func (ps *Persister) ReadRaftState() []byte { 45 | ps.mu.Lock() 46 | defer ps.mu.Unlock() 47 | return ps.raftstate 48 | } 49 | 50 | // RaftStateSize is 51 | func (ps *Persister) RaftStateSize() int { 52 | ps.mu.Lock() 53 | defer ps.mu.Unlock() 54 | return len(ps.raftstate) 55 | } 56 | 57 | // SaveStateAndSnapshot is 58 | // Save both Raft state and K/V snapshot as a single atomic action, 59 | // to help avoid them getting out of sync. 60 | func (ps *Persister) SaveStateAndSnapshot(state []byte, snapshot []byte) { 61 | ps.mu.Lock() 62 | defer ps.mu.Unlock() 63 | ps.raftstate = state 64 | ps.snapshot = snapshot 65 | } 66 | 67 | // ReadSnapshot is 68 | func (ps *Persister) ReadSnapshot() []byte { 69 | ps.mu.Lock() 70 | defer ps.mu.Unlock() 71 | return ps.snapshot 72 | } 73 | 74 | // SnapshotSize is 75 | func (ps *Persister) SnapshotSize() int { 76 | ps.mu.Lock() 77 | defer ps.mu.Unlock() 78 | return len(ps.snapshot) 79 | } 80 | -------------------------------------------------------------------------------- /Raft/code/raft-API.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/aQuaYi/Distributed-Algorithms/Raft/code/labrpc" 8 | ) 9 | 10 | /** 11 | * // create a new Raft server instance: 12 | * rf := Make(peers, me, persister, applyCh) 13 | * 14 | * // start agreement on a new log entry: 15 | * rf.Start(command interface{}) (index, term, isLeader) 16 | * 17 | * // ask a Raft for its current term, and whether it thinks it is leader 18 | * rf.GetState() (term, isLeader) 19 | * 20 | * // each time a new entry is committed to the log, each Raft peer 21 | * // should send an ApplyMsg to the service (or tester). 22 | * type ApplyMsg 23 | * 24 | */ 25 | 26 | // Make is 27 | // the service or tester wants to create a Raft server. the ports 28 | // of all the Raft servers (including this one) are in peers[]. this 29 | // server's port is peers[me]. all the servers' peers[] arrays 30 | // have the same order. persister is a place for this server to 31 | // save its persistent state, and also initially holds the most 32 | // recent saved state, if any. applyCh is a channel on which the 33 | // tester or service expects Raft to send ApplyMsg messages. 34 | // Make() must return quickly, so it should start goroutines 35 | // for any long-running work. 
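// Added note: this matches how the tester creates an instance in config.go's
// start1():
//
//   applyCh := make(chan ApplyMsg)
//   rf := Make(ends, me, persister, applyCh)
//   // ...while a separate goroutine drains applyCh and checks each committed entry.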
36 | // 37 | func Make(peers []*labrpc.ClientEnd, me int, 38 | persister *Persister, applyCh chan ApplyMsg) *Raft { 39 | rf := &Raft{} 40 | 41 | // Make 函数参数的去处 42 | rf.peers = peers 43 | rf.me = me 44 | rf.persister = persister 45 | rf.chanApply = applyCh 46 | 47 | // 需要 persist 的参数 48 | rf.currentTerm = 0 49 | rf.votedFor = NOBODY 50 | le := LogEntry{LogIndex: 0, LogTerm: 0, Command: 0} 51 | rf.logs = append(rf.logs, le) // 在 logs 预先放入一个,方便 Raft.getLastIndex() 52 | 53 | // 私有状态 54 | rf.state = FOLLOWER 55 | // REVIEW: 把这些通道的设置成非缓冲的,看看会不会出错 56 | rf.chanCommit = make(chan struct{}, 100) 57 | rf.chanHeartBeat = make(chan struct{}, 100) 58 | rf.chanBeElected = make(chan struct{}, 100) 59 | 60 | // initialize from state persisted before a crash 61 | rf.readPersist(persister.ReadRaftState()) 62 | 63 | go rf.statesLoop() 64 | 65 | go rf.applyLoop() 66 | 67 | return rf 68 | } 69 | 70 | func (rf *Raft) statesLoop() { 71 | for { 72 | switch rf.state { 73 | case FOLLOWER: 74 | select { 75 | case <-time.After(electionTimeout()): 76 | rf.state = CANDIDATE 77 | case <-rf.chanHeartBeat: 78 | } 79 | case CANDIDATE: 80 | rf.newElection() 81 | case LEADER: 82 | rf.newHeartBeat() 83 | } 84 | } 85 | } 86 | 87 | func (rf *Raft) newElection() { 88 | rf.mu.Lock() 89 | 90 | rf.currentTerm++ 91 | rf.votedFor = rf.me 92 | rf.voteCount = 1 93 | 94 | rf.persist() 95 | rf.mu.Unlock() 96 | 97 | DPrintf("%s begin new election\n", rf) 98 | 99 | go rf.broadcastRequestVote() 100 | 101 | select { 102 | case <-time.After(electionTimeout()): 103 | case <-rf.chanHeartBeat: 104 | rf.state = FOLLOWER 105 | DPrintf("%s receives chanHeartbeat", rf) 106 | case <-rf.chanBeElected: 107 | rf.comeToPower() 108 | } 109 | } 110 | 111 | func (rf *Raft) comeToPower() { 112 | rf.mu.Lock() 113 | rf.state = LEADER 114 | DPrintf("%s is Leader now", rf) 115 | rf.nextIndex = make([]int, len(rf.peers)) 116 | rf.matchIndex = make([]int, len(rf.peers)) 117 | for i := range rf.peers { 118 | rf.nextIndex[i] = rf.getLastIndex() + 1 119 | rf.matchIndex[i] = 0 120 | } 121 | rf.mu.Unlock() 122 | } 123 | 124 | func (rf *Raft) newHeartBeat() { 125 | DPrintf("%s broadcastAppendEntries", rf) 126 | rf.broadcastAppendEntries() 127 | <-time.After(heartBeat) 128 | } 129 | 130 | func (rf *Raft) applyLoop() { 131 | for { 132 | <-rf.chanCommit 133 | DPrintf("%s COMMITTED %s", rf, rf.details()) 134 | // 135 | rf.mu.Lock() 136 | // 137 | commitIndex := rf.commitIndex 138 | baseIndex := rf.getBaseIndex() 139 | for i := rf.lastApplied + 1; i <= commitIndex; i++ { 140 | msg := ApplyMsg{ 141 | CommandValid: true, 142 | CommandIndex: i, 143 | Command: rf.logs[i-baseIndex].Command, 144 | } 145 | rf.chanApply <- msg 146 | DPrintf("%s ApplyMSG: %s %s", rf, msg, rf.details()) 147 | rf.lastApplied = i 148 | } 149 | // 150 | rf.mu.Unlock() 151 | } 152 | } 153 | 154 | // Start is 155 | // the service using Raft (e.g. a k/v server) wants to start 156 | // agreement on the next command to be appended to Raft's log. if this 157 | // server isn't the leader, returns false. otherwise start the 158 | // agreement and 159 | // ** return immediately, without waiting for the log appends to complete. ** 160 | // there is no guarantee that this 161 | // command will ever be committed to the Raft log, since the leader 162 | // may fail or lose an election. even if the Raft instance has been killed, 163 | // this function should return gracefully. 164 | // 165 | // the first return value is the index that the command will appear at 166 | // if it's ever committed. 
the second return value is the current 167 | // term. the third return value is true if this server believes it is 168 | // the leader. 169 | // 170 | func (rf *Raft) Start(command interface{}) (int, int, bool) { 171 | // Your code here (2B). 172 | rf.mu.Lock() 173 | defer rf.mu.Unlock() 174 | 175 | if !rf.isLeader() { 176 | return -1, -1, false 177 | } 178 | 179 | DPrintf("%s Start %v", rf, command) 180 | 181 | logIndex := rf.getLastIndex() + 1 182 | term := rf.currentTerm 183 | isLeader := rf.isLeader() 184 | 185 | rf.logs = append(rf.logs, 186 | LogEntry{ 187 | LogIndex: logIndex, 188 | LogTerm: term, 189 | Command: command, 190 | }) // append new entry from client 191 | 192 | rf.persist() 193 | 194 | // Your code above 195 | return logIndex, term, isLeader 196 | } 197 | 198 | // GetState is 199 | // return currentTerm and whether this server 200 | // believes it is the leader. 201 | func (rf *Raft) GetState() (int, bool) { 202 | 203 | var term int 204 | var isLeader bool 205 | // Your code here (2A). 206 | 207 | term = rf.currentTerm 208 | isLeader = rf.isLeader() 209 | 210 | // Your code above (2A) 211 | return term, isLeader 212 | } 213 | 214 | // ApplyMsg is 215 | // as each Raft peer becomes aware that successive log entries are 216 | // committed, the peer should send an ApplyMsg to the service (or 217 | // tester) on the same server, via the applyCh passed to Make(). set 218 | // CommandValid to true to indicate that the ApplyMsg contains a newly 219 | // committed log entry. 220 | // 221 | // in Lab 3 you'll want to send other kinds of messages (e.g., 222 | // snapshots) on the applyCh; at that point you can add fields to 223 | // ApplyMsg, but set CommandValid to false for these other uses. 224 | // 225 | type ApplyMsg struct { 226 | CommandValid bool // CommandValid = true 表示, 此消息是用于应用 Command 227 | CommandIndex int // Command 所在的 logEntry.logIndex 值 228 | Command interface{} 229 | } 230 | 231 | func (m ApplyMsg) String() string { 232 | return fmt.Sprintf("ApplyMsg{Valid:%t,Index:%d,Command:%v}", m.CommandValid, m.CommandIndex, m.Command) 233 | } 234 | 235 | // Kill is 236 | // the tester calls Kill() when a Raft instance won't 237 | // be needed again. you are not required to do anything 238 | // in Kill(), but it might be convenient to (for example) 239 | // turn off debug output from this instance. 240 | // 241 | func (rf *Raft) Kill() { 242 | // Your code here, if desired. 
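	// Sketch of one possible implementation (hypothetical, not present in this
	// code base): add a `dead int32` field to Raft, set it atomically here with
	//   atomic.StoreInt32(&rf.dead, 1)
	// and have statesLoop()/applyLoop() check atomic.LoadInt32(&rf.dead) and return.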
243 | } 244 | -------------------------------------------------------------------------------- /Raft/code/raft-AppendEntries.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "fmt" 4 | 5 | // AppendEntriesArgs 是添加 log 的参数 6 | type AppendEntriesArgs struct { 7 | Term int // leader.currentTerm 8 | LeaderID int // leader.me 9 | PrevLogIndex int // index of log entry immediately preceding new ones 10 | PrevLogTerm int // term of prevLogIndex entry 11 | LeaderCommit int // leader.commitIndex 12 | Entries []LogEntry // 需要添加的 log 单元 13 | } 14 | 15 | func (a AppendEntriesArgs) String() string { 16 | return fmt.Sprintf("appendEntriesArgs{R%d:T%d, PrevLogIndex:%d, PrevLogTerm:%d, LeaderCommit:%d, entries:%v}", 17 | a.LeaderID, a.Term, a.PrevLogIndex, a.PrevLogTerm, a.LeaderCommit, a.Entries) 18 | } 19 | 20 | func (rf *Raft) newAppendEntriesArgs(server int) AppendEntriesArgs { 21 | prevLogIndex := rf.nextIndex[server] - 1 22 | baseIndex := rf.getBaseIndex() 23 | return AppendEntriesArgs{ 24 | Term: rf.currentTerm, 25 | LeaderID: rf.me, 26 | PrevLogIndex: prevLogIndex, 27 | PrevLogTerm: rf.logs[prevLogIndex-baseIndex].LogTerm, 28 | Entries: rf.logs[prevLogIndex+1-baseIndex:], 29 | LeaderCommit: rf.commitIndex, 30 | } 31 | } 32 | 33 | // AppendEntriesReply 是 flower 回复 leader 的内容 34 | type AppendEntriesReply struct { 35 | Term int // 回复者的 term 36 | Success bool // 返回 true,如果被调用的 rf.logs 真的 append 了 entries 37 | NextIndex int // 下一次发送的 AppendEntriesArgs.Entries[0] 在 Leader.logs 中的索引号 38 | } 39 | 40 | func (r AppendEntriesReply) String() string { 41 | return fmt.Sprintf("appendEntriesReply{T%d, Success:%t, NextIndex:%d}", 42 | r.Term, r.Success, r.NextIndex) 43 | } 44 | 45 | func (rf *Raft) sendAppendEntries(server int, args AppendEntriesArgs, reply *AppendEntriesReply) bool { 46 | return rf.peers[server].Call("Raft.AppendEntries", args, reply) 47 | } 48 | 49 | // 广播 AppendEntries 有两个作用 50 | // 1. heart beat: 阻止其他 server 发起选举 51 | // 2. 
同步 log 到其他 server 52 | func (rf *Raft) broadcastAppendEntries() { 53 | rf.mu.Lock() 54 | defer rf.mu.Unlock() 55 | 56 | lastIndex := rf.getLastIndex() 57 | baseIndex := rf.getBaseIndex() 58 | 59 | newCommitIndex := 0 60 | // 统计 leader 的此 term 的已复制 log 数量,超过半数,就可以 commit 了 61 | for idx := rf.commitIndex + 1; idx <= lastIndex; idx++ { 62 | count := 1 // 1 是 rf 自己的一票 63 | for id := range rf.peers { 64 | if id != rf.me && 65 | rf.matchIndex[id] >= idx && 66 | rf.logs[idx-baseIndex].LogTerm == rf.currentTerm { 67 | count++ 68 | } 69 | } 70 | if 2*count > len(rf.peers) { 71 | newCommitIndex = idx 72 | } 73 | } 74 | if newCommitIndex > rf.commitIndex { 75 | rf.commitIndex = newCommitIndex 76 | rf.chanCommit <- struct{}{} 77 | DPrintf("%s COMMITTED %s", rf, rf.details()) 78 | } 79 | 80 | for id := range rf.peers { 81 | if id != rf.me && rf.isLeader() { 82 | args := rf.newAppendEntriesArgs(id) 83 | go rf.sendAppendEntriesAndDealReply(id, args) 84 | } 85 | } 86 | } 87 | 88 | func (rf *Raft) sendAppendEntriesAndDealReply(id int, args AppendEntriesArgs) { 89 | var reply AppendEntriesReply 90 | 91 | DPrintf("%s AppendEntries to R%d with %s", rf, id, args) 92 | 93 | ok := rf.sendAppendEntries(id, args, &reply) 94 | if !ok { 95 | return 96 | } 97 | 98 | rf.mu.Lock() 99 | defer rf.mu.Unlock() 100 | 101 | if reply.Term > rf.currentTerm { 102 | rf.currentTerm = reply.Term 103 | rf.state = FOLLOWER 104 | rf.votedFor = NOBODY 105 | rf.persist() 106 | return 107 | } 108 | 109 | if rf.currentTerm != args.Term { 110 | // term 已经改变 111 | return 112 | } 113 | 114 | if !reply.Success { 115 | rf.nextIndex[id] = reply.NextIndex 116 | return 117 | } 118 | 119 | if len(args.Entries) == 0 { 120 | // 纯 heartBeat 就无需进一步处理了 121 | return 122 | } 123 | 124 | lastArgsLogIndex := args.Entries[len(args.Entries)-1].LogIndex 125 | rf.matchIndex[id] = lastArgsLogIndex 126 | rf.nextIndex[id] = lastArgsLogIndex + 1 127 | } 128 | 129 | // AppendEntries 会处理收到 AppendEntries RPC 130 | func (rf *Raft) AppendEntries(args AppendEntriesArgs, reply *AppendEntriesReply) { 131 | rf.mu.Lock() 132 | defer rf.mu.Unlock() 133 | 134 | // REVIEW: 按照 figure 2 中的内容来,重新编写此函数 135 | 136 | reply.Success = false 137 | 138 | // 1. Replay false at once if term < currentTerm 139 | if args.Term < rf.currentTerm { 140 | reply.Term = rf.currentTerm 141 | DPrintf("%s rejected %s", rf, args) 142 | return 143 | } 144 | 145 | defer rf.persist() 146 | 147 | rf.chanHeartBeat <- struct{}{} 148 | 149 | DPrintf("%s 收到了真实有效的信号 %s", rf, args) 150 | 151 | if args.Term > rf.currentTerm { 152 | rf.currentTerm = args.Term 153 | rf.state = FOLLOWER 154 | rf.votedFor = NOBODY 155 | } 156 | 157 | reply.Term = rf.currentTerm 158 | 159 | if args.PrevLogIndex > rf.getLastIndex() { 160 | reply.NextIndex = rf.getLastIndex() + 1 161 | return 162 | } 163 | 164 | baseIndex := rf.getBaseIndex() 165 | 166 | if args.PrevLogIndex > baseIndex { 167 | term := rf.logs[args.PrevLogIndex-baseIndex].LogTerm 168 | if args.PrevLogTerm != term { 169 | for i := args.PrevLogIndex - 1; i >= baseIndex; i-- { 170 | if rf.logs[i-baseIndex].LogTerm != term { 171 | reply.NextIndex = i + 1 172 | break 173 | } 174 | } 175 | return 176 | } 177 | } 178 | 179 | if args.PrevLogIndex >= baseIndex { 180 | rf.logs = rf.logs[:args.PrevLogIndex+1-baseIndex] 181 | rf.logs = append(rf.logs, args.Entries...) 182 | reply.Success = true 183 | reply.NextIndex = rf.getLastIndex() + 1 184 | } 185 | 186 | // 5. 
if leadercommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) 187 | if args.LeaderCommit > rf.commitIndex { 188 | rf.commitIndex = min(args.LeaderCommit, rf.getLastIndex()) 189 | rf.chanCommit <- struct{}{} 190 | } 191 | 192 | } 193 | -------------------------------------------------------------------------------- /Raft/code/raft-LogEntry.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // LogEntry is log entry 4 | type LogEntry struct { 5 | LogIndex int // raft.logs 会被压缩裁剪,需要保存此 log 在原本的索引号 6 | LogTerm int // LEADER 在生成此 log 时的 LEADER.currentTerm 7 | Command interface{} // 具体的命令内容 8 | } 9 | -------------------------------------------------------------------------------- /Raft/code/raft-Raft.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | 7 | "github.com/aQuaYi/Distributed-Algorithms/Raft/code/labrpc" 8 | ) 9 | 10 | const ( 11 | // NOBODY used for Raft.votedFor, means vote for none 12 | NOBODY = -1 13 | ) 14 | 15 | // Raft is 16 | // A Go object implementing a single Raft peer. 17 | // 18 | type Raft struct { 19 | mu sync.Mutex // Lock to protect shared access to this peer's state 20 | peers []*labrpc.ClientEnd // RPC end points of all peers 21 | persister *Persister // Object to hold this peer's persisted state 22 | me int // this peer's index into peers[] 23 | 24 | // Your data here (2A, 2B, 2C). 25 | // Look at the paper's Figure 2 for a description of what 26 | // state a Raft server must maintain. 27 | 28 | /* ↓ state of raft on Figure 2 ↓ */ 29 | 30 | // Persistent state on all servers: 31 | // "Persistent" 的意思是,一旦被修改,就要运行 rf.persist() 32 | currentTerm int // latest term server has seen. Initialized to 0. 
33 | votedFor int // candidateID that received vote in current Term 34 | logs []LogEntry // NOTICE: first LogEntry.LogIndex is 1 35 | 36 | // Volatile state on all servers: initialized to 0, increase monotonically 37 | commitIndex int // index of highest log entry known to be committed 38 | lastApplied int // index of highest log entry known to be applied to state machine 39 | 40 | // Volatile state on leader: 41 | // nextIndex : for each server, index of the next log entry to send to that server 42 | // initialized to leader last LogIndex+1 43 | nextIndex []int 44 | // matchIndex : for each server, index of highest log entry known to be replicated on server 45 | // initialized to 0, increases monotonically 46 | matchIndex []int 47 | 48 | /* ↑ state of raft on Figure 2 ↑ */ 49 | 50 | state state 51 | voteCount int 52 | 53 | chanApply chan ApplyMsg 54 | 55 | //channel 56 | chanCommit chan struct{} 57 | chanHeartBeat chan struct{} 58 | chanBeElected chan struct{} 59 | } 60 | 61 | func (rf *Raft) String() string { 62 | return fmt.Sprintf(" ", 63 | rf.me, rf.currentTerm, rf.state, rf.commitIndex, rf.lastApplied) 64 | } 65 | 66 | func (rf *Raft) details() string { 67 | postfix := "" 68 | if rf.state == LEADER { 69 | postfix = fmt.Sprintf(", nextIndex%v, matchIndex%v", rf.nextIndex, rf.matchIndex) 70 | } 71 | return fmt.Sprintf("@@ votedFor:%2d, commitIndex:%d, lastApplied:%d, logs:%v%s @@", 72 | rf.votedFor, rf.commitIndex, rf.lastApplied, rf.logs, postfix) 73 | } 74 | -------------------------------------------------------------------------------- /Raft/code/raft-RequestVote.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "fmt" 4 | 5 | // RequestVoteArgs 获取投票参数 6 | // example RequestVote RPC arguments structure. 7 | // field names must start with capital letters! 8 | // 9 | type RequestVoteArgs struct { 10 | Term int // candidate's term 11 | CandidateID int // candidate requesting vote 12 | LastLogIndex int // index of candidate's last log entry 13 | LastLogTerm int // term of candidate's last log entry 14 | } 15 | 16 | func (a RequestVoteArgs) String() string { 17 | return fmt.Sprintf("voteArgs{R%d:T%d;LastIndex:%d;LastTerm:%d}", 18 | a.CandidateID, a.Term, a.LastLogIndex, a.LastLogTerm) 19 | } 20 | 21 | // RequestVoteReply is 22 | // example RequestVote RPC reply structure. 23 | // field names must start with capital letters! 24 | // 25 | type RequestVoteReply struct { 26 | Term int 27 | VoteGranted bool 28 | } 29 | 30 | func (reply RequestVoteReply) String() string { 31 | return fmt.Sprintf("voteReply{T%d,Granted:%t}", reply.Term, reply.VoteGranted) 32 | } 33 | 34 | // RequestVote is 35 | // example RequestVote RPC handler. 36 | // 37 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 38 | DPrintf("%s 收到投票请求 [%s]", rf, args) 39 | 40 | rf.mu.Lock() 41 | defer rf.mu.Unlock() 42 | 43 | // 1. replay false if term < currentTerm 44 | if args.Term < rf.currentTerm { 45 | reply.Term = rf.currentTerm 46 | reply.VoteGranted = false 47 | return 48 | } 49 | 50 | defer rf.persist() 51 | 52 | if args.Term > rf.currentTerm { 53 | rf.currentTerm = args.Term 54 | rf.state = FOLLOWER 55 | rf.votedFor = NOBODY 56 | } 57 | 58 | reply.Term = rf.currentTerm 59 | 60 | // 2. 
votedFor is null or candidateId and 61 | // candidate's log is at least as up-to-date as receiver's log, then grant vote 62 | // If the logs have last entries with different terms, then the log with the later term is more up-to-date 63 | // If the logs end with the same term, then whichever log is longer is more up-to-date 64 | 65 | if isValidArgs(rf, args) { 66 | reply.VoteGranted = true 67 | rf.chanHeartBeat <- struct{}{} 68 | rf.votedFor = args.CandidateID 69 | DPrintf("%s voted for %s", rf, args) 70 | return 71 | } 72 | DPrintf("%s **NOT** voted for %s", rf, args) 73 | } 74 | 75 | func isValidArgs(rf *Raft, args *RequestVoteArgs) bool { 76 | term := rf.getLastTerm() 77 | index := rf.getLastIndex() 78 | return (rf.votedFor == NOBODY || rf.votedFor == args.CandidateID) && 79 | isUpToDate(args, term, index) 80 | } 81 | 82 | func isUpToDate(args *RequestVoteArgs, term, index int) bool { 83 | return (args.LastLogTerm > term) || 84 | (args.LastLogTerm == term && args.LastLogIndex >= index) 85 | } 86 | 87 | func (rf *Raft) broadcastRequestVote() { 88 | var args RequestVoteArgs 89 | 90 | rf.mu.Lock() 91 | args.Term = rf.currentTerm 92 | args.CandidateID = rf.me 93 | args.LastLogTerm = rf.getLastTerm() 94 | args.LastLogIndex = rf.getLastIndex() 95 | rf.mu.Unlock() 96 | 97 | for i := range rf.peers { 98 | if i != rf.me && rf.isCandidate() { 99 | go rf.sendRequestVoteAndDealReply(i, args) 100 | } 101 | } 102 | } 103 | 104 | func (rf *Raft) sendRequestVoteAndDealReply(i int, args RequestVoteArgs) { 105 | var reply RequestVoteReply 106 | 107 | DPrintf("%s RequestVote to %d", rf, i) 108 | 109 | ok := rf.sendRequestVote(i, &args, &reply) 110 | if !ok { 111 | return 112 | } 113 | 114 | rf.mu.Lock() 115 | defer rf.mu.Unlock() 116 | 117 | if reply.Term > rf.currentTerm { 118 | rf.currentTerm = reply.Term 119 | rf.state = FOLLOWER 120 | rf.votedFor = NOBODY 121 | rf.persist() 122 | return 123 | } 124 | 125 | if rf.currentTerm != args.Term || !reply.VoteGranted { 126 | // term 已经改变 或 没有投我的票 127 | return 128 | } 129 | 130 | rf.voteCount++ 131 | if 2*rf.voteCount > len(rf.peers) && rf.isCandidate() { 132 | rf.chanBeElected <- struct{}{} 133 | } 134 | } 135 | 136 | // 137 | // example code to send a RequestVote RPC to a server. 138 | // server is the index of the target server in rf.peers[]. 139 | // expects RPC arguments in args. 140 | // fills in *reply with RPC reply, so caller should 141 | // pass &reply. 142 | // the types of the args and reply passed to Call() must be 143 | // the same as the types of the arguments declared in the 144 | // handler function (including whether they are pointers). 145 | // 146 | // The labrpc package simulates a lossy network, in which servers 147 | // may be unreachable, and in which requests and replies may be lost. 148 | // Call() sends a request and waits for a reply. If a reply arrives 149 | // within a timeout interval, Call() returns true; otherwise 150 | // Call() returns false. Thus Call() may not return for a while. 151 | // A false return can be caused by a dead server, a live server that 152 | // can't be reached, a lost request, or a lost reply. 153 | // 154 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 155 | // handler function on the server side does not return. Thus there 156 | // is no need to implement your own timeouts around Call(). 157 | // 158 | // look at the comments in ../labrpc/labrpc.go for more details. 
159 | // 160 | // if you're having trouble getting RPC to work, check that you've 161 | // capitalized all field names in struts passed over RPC, and 162 | // that the caller passes the address of the reply struct with &, not 163 | // the struct itself. 164 | // 165 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) bool { 166 | return rf.peers[server].Call("Raft.RequestVote", args, reply) 167 | } 168 | -------------------------------------------------------------------------------- /Raft/code/raft-method.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 这里的方法都是被内部引用的,所以无需加锁 4 | 5 | func (rf *Raft) getLastIndex() int { 6 | return rf.logs[len(rf.logs)-1].LogIndex 7 | } 8 | 9 | func (rf *Raft) getBaseIndex() int { 10 | return rf.logs[0].LogIndex 11 | } 12 | 13 | func (rf *Raft) getLastTerm() int { 14 | return rf.logs[len(rf.logs)-1].LogTerm 15 | } 16 | 17 | func (rf *Raft) isLeader() bool { 18 | return rf.state == LEADER 19 | } 20 | 21 | func (rf *Raft) isCandidate() bool { 22 | return rf.state == CANDIDATE 23 | } 24 | 25 | func (rf *Raft) isFollower() bool { 26 | return rf.state == FOLLOWER 27 | } 28 | -------------------------------------------------------------------------------- /Raft/code/raft-persist.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | 7 | "github.com/aQuaYi/Distributed-Algorithms/Raft/code/labgob" 8 | ) 9 | 10 | // 11 | // save Raft's persistent state to stable storage, 12 | // where it can later be retrieved after a crash and restart. 13 | // see paper's Figure 2 for a description of what should be persistent. 14 | // 15 | func (rf *Raft) persist() { 16 | // Your code here (2C). 17 | // Example: 18 | w := new(bytes.Buffer) 19 | e := labgob.NewEncoder(w) 20 | e.Encode(rf.currentTerm) 21 | e.Encode(rf.votedFor) 22 | e.Encode(rf.logs) 23 | data := w.Bytes() 24 | rf.persister.SaveRaftState(data) 25 | 26 | DPrintf("%s PERSISTED", rf) 27 | } 28 | 29 | // 30 | // restore previously persisted state. 31 | // 32 | func (rf *Raft) readPersist(data []byte) { 33 | if data == nil || len(data) < 1 { // bootstrap without any state? 
34 | return 35 | } 36 | r := bytes.NewBuffer(data) 37 | d := gob.NewDecoder(r) 38 | d.Decode(&rf.currentTerm) 39 | d.Decode(&rf.votedFor) 40 | d.Decode(&rf.logs) 41 | } 42 | -------------------------------------------------------------------------------- /Raft/code/raft-settings.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "log" 5 | "math/rand" 6 | "time" 7 | ) 8 | 9 | func init() { 10 | log.SetFlags(log.LstdFlags | log.Lmicroseconds) 11 | DPrintf("程序开始运行") 12 | } 13 | 14 | const ( 15 | // heartBeat 发送心跳的时间间隔,ms 16 | heartBeat = 50 * time.Millisecond 17 | // minElection 选举过期的最小时间间隔,ms 18 | minElection = heartBeat * 10 19 | // minElectionInterval 选举过期的最大时间间隔,ms 20 | maxElection = minElection * 8 / 5 21 | 22 | // 按照论文 5.6 Timing and availability 的要求 23 | // heartBeat 和 minElection 需要相差了一个数量级 24 | ) 25 | 26 | func electionTimeout() time.Duration { 27 | interval := int(minElection) + 28 | rand.Intn(int(maxElection-minElection)) 29 | return time.Duration(interval) 30 | } 31 | -------------------------------------------------------------------------------- /Raft/code/raft-settings_test.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func Test_ElectionTimeout(t *testing.T) { 11 | ast := assert.New(t) 12 | for i := 0; i < 1000; i++ { 13 | rate := electionTimeout() / heartBeat 14 | ast.True(rate >= 10, "electionTimeout 没有比 heartBeatInterval 大 10 倍") 15 | } 16 | } 17 | 18 | func Test_heartBeat_isInRange(t *testing.T) { 19 | ast := assert.New(t) 20 | minInterval := 30 * time.Millisecond 21 | maxInterval := 100 * time.Millisecond 22 | isInRange := minInterval <= heartBeat && heartBeat <= maxInterval 23 | ast.True(isInRange, " heartBeat 设置的过大或过小") 24 | } 25 | -------------------------------------------------------------------------------- /Raft/code/raft-state.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | type state int 4 | 5 | // 规定了 server 所需的 3 种状态 6 | const ( 7 | LEADER state = iota 8 | CANDIDATE 9 | FOLLOWER 10 | ) 11 | 12 | func (s state) String() string { 13 | switch s { 14 | case LEADER: 15 | return "Leader" 16 | case CANDIDATE: 17 | return "Candidate" 18 | case FOLLOWER: 19 | return "Follower" 20 | default: 21 | panic("出现了第4种 server state") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Raft/code/raft-state_test.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "testing" 4 | 5 | func Test_state_String(t *testing.T) { 6 | tests := []struct { 7 | name string 8 | s state 9 | want string 10 | }{ 11 | 12 | { 13 | "Follower", 14 | FOLLOWER, 15 | "Follower", 16 | }, 17 | 18 | { 19 | "Candidate", 20 | CANDIDATE, 21 | "Candidate", 22 | }, 23 | 24 | { 25 | "Leader", 26 | LEADER, 27 | "Leader", 28 | }, 29 | } 30 | for _, tt := range tests { 31 | t.Run(tt.name, func(t *testing.T) { 32 | if got := tt.s.String(); got != tt.want { 33 | t.Errorf("state.String() = %v, want %v", got, tt.want) 34 | } 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /Raft/code/test_test.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // Raft tests. 
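//
// Assuming the standard Go toolchain, subsets of these tests can be selected
// by name pattern, for example:
//
//	go test -race -run 2A    // leader-election tests
//	go test -race -run 2B    // log replication / agreement tests
//	go test -race -run 2C    // persistence and Figure 8 tests
//
// (-race is optional, but it helps catch locking mistakes in the raft-*.go files.)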
5 | // 6 | // we will use the original test_test.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 9 | // 10 | 11 | import "testing" 12 | import "fmt" 13 | import "time" 14 | import "math/rand" 15 | import "sync/atomic" 16 | import "sync" 17 | 18 | // The tester generously allows solutions to complete elections in one second 19 | // (much more than the paper's range of timeouts). 20 | const RaftElectionTimeout = 1000 * time.Millisecond 21 | 22 | func TestInitialElection2A(t *testing.T) { 23 | servers := 3 24 | cfg := makeConfig(t, servers, false) 25 | defer cfg.cleanup() 26 | 27 | cfg.begin("Test (2A): InitialElection2A initial election") 28 | // is a leader elected? 29 | cfg.checkOneLeader() 30 | 31 | // sleep a bit to avoid racing with followers learning of the 32 | // election, then check that all peers agree on the term. 33 | time.Sleep(50 * time.Millisecond) 34 | term1 := cfg.checkTerms() 35 | 36 | // does the leader+term stay the same if there is no network failure? 37 | time.Sleep(2 * RaftElectionTimeout) 38 | term2 := cfg.checkTerms() 39 | if term1 != term2 { 40 | fmt.Printf("warning: term changed even though there were no failures") 41 | } 42 | 43 | // there should still be a leader. 44 | cfg.checkOneLeader() 45 | 46 | cfg.end() 47 | } 48 | 49 | func TestReElection2A(t *testing.T) { 50 | servers := 3 51 | cfg := makeConfig(t, servers, false) 52 | defer cfg.cleanup() 53 | 54 | cfg.begin("Test (2A): ReElection2A election after network failure") 55 | 56 | leader1 := cfg.checkOneLeader() 57 | 58 | // if the leader disconnects, a new one should be elected. 59 | cfg.disconnect(leader1) 60 | cfg.checkOneLeader() 61 | 62 | // if the old leader rejoins, that shouldn't 63 | // disturb the new leader. 64 | cfg.connect(leader1) 65 | leader2 := cfg.checkOneLeader() 66 | 67 | // if there's no quorum, no leader should 68 | // be elected. 69 | cfg.disconnect(leader2) 70 | cfg.disconnect((leader2 + 1) % servers) 71 | time.Sleep(2 * RaftElectionTimeout) 72 | cfg.checkNoLeader() 73 | 74 | // if a quorum arises, it should elect a leader. 75 | cfg.connect((leader2 + 1) % servers) 76 | cfg.checkOneLeader() 77 | 78 | // re-join of last node shouldn't prevent leader from existing. 79 | cfg.connect(leader2) 80 | cfg.checkOneLeader() 81 | 82 | cfg.end() 83 | } 84 | 85 | func TestBasicAgree2B(t *testing.T) { 86 | servers := 5 87 | cfg := makeConfig(t, servers, false) 88 | defer cfg.cleanup() 89 | 90 | cfg.begin("Test (2B): BasicAgree2B basic agreement") 91 | 92 | iters := 3 93 | for index := 1; index < iters+1; index++ { 94 | nd, _ := cfg.nCommitted(index) 95 | if nd > 0 { 96 | t.Fatalf("some have committed before Start()") 97 | } 98 | 99 | xindex := cfg.one(index*100, servers, false) 100 | if xindex != index { 101 | t.Fatalf("got index %v but expected %v", xindex, index) 102 | } 103 | } 104 | 105 | cfg.end() 106 | } 107 | 108 | func TestFailAgree2B(t *testing.T) { 109 | servers := 3 110 | cfg := makeConfig(t, servers, false) 111 | defer cfg.cleanup() 112 | 113 | cfg.begin("Test (2B): FailAgree2B agreement despite follower disconnection") 114 | 115 | cfg.one(101, servers, false) 116 | 117 | // follower network disconnection 118 | leader := cfg.checkOneLeader() 119 | cfg.disconnect((leader + 1) % servers) 120 | 121 | // agree despite one disconnected server? 
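	// (With 2 of the 3 servers still connected, a majority can still commit,
	// which is why these cfg.one() calls only expect servers-1 replicas.)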
122 | cfg.one(102, servers-1, false) 123 | cfg.one(103, servers-1, false) 124 | time.Sleep(RaftElectionTimeout) 125 | cfg.one(104, servers-1, false) 126 | cfg.one(105, servers-1, false) 127 | 128 | // re-connect 129 | cfg.connect((leader + 1) % servers) 130 | 131 | // agree with full set of servers? 132 | cfg.one(106, servers, true) 133 | time.Sleep(RaftElectionTimeout) 134 | cfg.one(107, servers, true) 135 | 136 | cfg.end() 137 | } 138 | 139 | func TestFailNoAgree2B(t *testing.T) { 140 | servers := 5 141 | cfg := makeConfig(t, servers, false) 142 | defer cfg.cleanup() 143 | 144 | cfg.begin("Test (2B): FailNoAgree2B no agreement if too many followers disconnect") 145 | 146 | cfg.one(10, servers, false) 147 | 148 | // 3 of 5 followers disconnect 149 | leader := cfg.checkOneLeader() 150 | cfg.disconnect((leader + 1) % servers) 151 | cfg.disconnect((leader + 2) % servers) 152 | cfg.disconnect((leader + 3) % servers) 153 | 154 | index, _, ok := cfg.rafts[leader].Start(20) 155 | if ok != true { 156 | t.Fatalf("leader rejected Start()") 157 | } 158 | if index != 2 { 159 | t.Fatalf("expected index 2, got %v", index) 160 | } 161 | 162 | time.Sleep(2 * RaftElectionTimeout) 163 | 164 | n, _ := cfg.nCommitted(index) 165 | if n > 0 { 166 | t.Fatalf("%v committed but no majority", n) 167 | } 168 | 169 | // repair 170 | cfg.connect((leader + 1) % servers) 171 | cfg.connect((leader + 2) % servers) 172 | cfg.connect((leader + 3) % servers) 173 | 174 | // the disconnected majority may have chosen a leader from 175 | // among their own ranks, forgetting index 2. 176 | leader2 := cfg.checkOneLeader() 177 | index2, _, ok2 := cfg.rafts[leader2].Start(30) 178 | if ok2 == false { 179 | t.Fatalf("leader2 rejected Start()") 180 | } 181 | if index2 < 2 || index2 > 3 { 182 | t.Fatalf("unexpected index %v", index2) 183 | } 184 | 185 | cfg.one(1000, servers, true) 186 | 187 | cfg.end() 188 | } 189 | 190 | func TestConcurrentStarts2B(t *testing.T) { 191 | servers := 3 192 | cfg := makeConfig(t, servers, false) 193 | defer cfg.cleanup() 194 | 195 | cfg.begin("Test (2B): ConcurrentStarts2B concurrent Start()s") 196 | 197 | var success bool 198 | loop: 199 | for try := 0; try < 5; try++ { 200 | if try > 0 { 201 | // give solution some time to settle 202 | time.Sleep(3 * time.Second) 203 | } 204 | 205 | leader := cfg.checkOneLeader() 206 | _, term, ok := cfg.rafts[leader].Start(1) 207 | if !ok { 208 | // leader moved on really quickly 209 | continue 210 | } 211 | 212 | iters := 5 213 | var wg sync.WaitGroup 214 | is := make(chan int, iters) 215 | for ii := 0; ii < iters; ii++ { 216 | wg.Add(1) 217 | go func(i int) { 218 | defer wg.Done() 219 | i, term1, ok := cfg.rafts[leader].Start(100 + i) 220 | if term1 != term { 221 | return 222 | } 223 | if ok != true { 224 | return 225 | } 226 | is <- i 227 | }(ii) 228 | } 229 | 230 | wg.Wait() 231 | close(is) 232 | 233 | for j := 0; j < servers; j++ { 234 | if t, _ := cfg.rafts[j].GetState(); t != term { 235 | // term changed -- can't expect low RPC counts 236 | continue loop 237 | } 238 | } 239 | 240 | failed := false 241 | cmds := []int{} 242 | for index := range is { 243 | cmd := cfg.wait(index, servers, term) 244 | if ix, ok := cmd.(int); ok { 245 | if ix == -1 { 246 | // peers have moved on to later terms 247 | // so we can't expect all Start()s to 248 | // have succeeded 249 | failed = true 250 | break 251 | } 252 | cmds = append(cmds, ix) 253 | } else { 254 | t.Fatalf("value %v is not an int", cmd) 255 | } 256 | } 257 | 258 | if failed { 259 | // avoid leaking goroutines 260 
| go func() { 261 | for range is { 262 | } 263 | }() 264 | continue 265 | } 266 | 267 | for ii := 0; ii < iters; ii++ { 268 | x := 100 + ii 269 | ok := false 270 | for j := 0; j < len(cmds); j++ { 271 | if cmds[j] == x { 272 | ok = true 273 | } 274 | } 275 | if ok == false { 276 | t.Fatalf("cmd %v missing in %v", x, cmds) 277 | } 278 | } 279 | 280 | success = true 281 | break 282 | } 283 | 284 | if !success { 285 | t.Fatalf("term changed too often") 286 | } 287 | 288 | cfg.end() 289 | } 290 | 291 | func TestRejoin2B(t *testing.T) { 292 | servers := 3 293 | cfg := makeConfig(t, servers, false) 294 | defer cfg.cleanup() 295 | 296 | cfg.begin("Test (2B): Rejoin2B rejoin of partitioned leader") 297 | 298 | cfg.one(101, servers, true) 299 | 300 | // leader network failure 301 | leader1 := cfg.checkOneLeader() 302 | cfg.disconnect(leader1) 303 | 304 | // make old leader try to agree on some entries 305 | cfg.rafts[leader1].Start(102) 306 | cfg.rafts[leader1].Start(103) 307 | cfg.rafts[leader1].Start(104) 308 | 309 | // new leader commits, also for index=2 310 | cfg.one(103, 2, true) 311 | 312 | // new leader network failure 313 | leader2 := cfg.checkOneLeader() 314 | cfg.disconnect(leader2) 315 | 316 | // old leader connected again 317 | cfg.connect(leader1) 318 | 319 | cfg.one(104, 2, true) 320 | 321 | // all together now 322 | cfg.connect(leader2) 323 | 324 | cfg.one(105, servers, true) 325 | 326 | cfg.end() 327 | } 328 | 329 | func TestBackup2B(t *testing.T) { 330 | servers := 5 331 | cfg := makeConfig(t, servers, false) 332 | defer cfg.cleanup() 333 | 334 | cfg.begin("Test (2B): Backup2B leader backs up quickly over incorrect follower logs") 335 | 336 | cfg.one(rand.Int(), servers, true) 337 | 338 | // put leader and one follower in a partition 339 | leader1 := cfg.checkOneLeader() 340 | cfg.disconnect((leader1 + 2) % servers) 341 | cfg.disconnect((leader1 + 3) % servers) 342 | cfg.disconnect((leader1 + 4) % servers) 343 | 344 | // submit lots of commands that won't commit 345 | for i := 0; i < 50; i++ { 346 | cfg.rafts[leader1].Start(rand.Int()) 347 | } 348 | 349 | time.Sleep(RaftElectionTimeout / 2) 350 | 351 | cfg.disconnect((leader1 + 0) % servers) 352 | cfg.disconnect((leader1 + 1) % servers) 353 | 354 | // allow other partition to recover 355 | cfg.connect((leader1 + 2) % servers) 356 | cfg.connect((leader1 + 3) % servers) 357 | cfg.connect((leader1 + 4) % servers) 358 | 359 | // lots of successful commands to new group. 360 | for i := 0; i < 50; i++ { 361 | cfg.one(rand.Int(), 3, true) 362 | } 363 | 364 | // now another partitioned leader and one follower 365 | leader2 := cfg.checkOneLeader() 366 | other := (leader1 + 2) % servers 367 | if leader2 == other { 368 | other = (leader2 + 1) % servers 369 | } 370 | cfg.disconnect(other) 371 | 372 | // lots more commands that won't commit 373 | for i := 0; i < 50; i++ { 374 | cfg.rafts[leader2].Start(rand.Int()) 375 | } 376 | 377 | time.Sleep(RaftElectionTimeout / 2) 378 | 379 | // bring original leader back to life, 380 | for i := 0; i < servers; i++ { 381 | cfg.disconnect(i) 382 | } 383 | cfg.connect((leader1 + 0) % servers) 384 | cfg.connect((leader1 + 1) % servers) 385 | cfg.connect(other) 386 | 387 | // lots of successful commands to new group. 
388 | for i := 0; i < 50; i++ { 389 | cfg.one(rand.Int(), 3, true) 390 | } 391 | 392 | // now everyone 393 | for i := 0; i < servers; i++ { 394 | cfg.connect(i) 395 | } 396 | cfg.one(rand.Int(), servers, true) 397 | 398 | cfg.end() 399 | } 400 | 401 | func TestCount2B(t *testing.T) { 402 | servers := 3 403 | cfg := makeConfig(t, servers, false) 404 | defer cfg.cleanup() 405 | 406 | cfg.begin("Test (2B): Count2B RPC counts aren't too high") 407 | 408 | rpcs := func() (n int) { 409 | for j := 0; j < servers; j++ { 410 | n += cfg.rpcCount(j) 411 | } 412 | return 413 | } 414 | 415 | leader := cfg.checkOneLeader() 416 | 417 | total1 := rpcs() 418 | 419 | if total1 > 30 || total1 < 1 { 420 | t.Fatalf("too many or few RPCs (%v) to elect initial leader\n", total1) 421 | } 422 | 423 | var total2 int 424 | var success bool 425 | loop: 426 | for try := 0; try < 5; try++ { 427 | if try > 0 { 428 | // give solution some time to settle 429 | time.Sleep(3 * time.Second) 430 | } 431 | 432 | leader = cfg.checkOneLeader() 433 | total1 = rpcs() 434 | 435 | iters := 10 436 | starti, term, ok := cfg.rafts[leader].Start(1) 437 | if !ok { 438 | // leader moved on really quickly 439 | continue 440 | } 441 | cmds := []int{} 442 | for i := 1; i < iters+2; i++ { 443 | x := int(rand.Int31()) 444 | cmds = append(cmds, x) 445 | index1, term1, ok := cfg.rafts[leader].Start(x) 446 | if term1 != term { 447 | // Term changed while starting 448 | continue loop 449 | } 450 | if !ok { 451 | // No longer the leader, so term has changed 452 | continue loop 453 | } 454 | if starti+i != index1 { 455 | t.Fatalf("Start() failed") 456 | } 457 | } 458 | 459 | for i := 1; i < iters+1; i++ { 460 | cmd := cfg.wait(starti+i, servers, term) 461 | if ix, ok := cmd.(int); ok == false || ix != cmds[i-1] { 462 | if ix == -1 { 463 | // term changed -- try again 464 | continue loop 465 | } 466 | t.Fatalf("wrong value %v committed for index %v; expected %v\n", cmd, starti+i, cmds) 467 | } 468 | } 469 | 470 | failed := false 471 | total2 = 0 472 | for j := 0; j < servers; j++ { 473 | if t, _ := cfg.rafts[j].GetState(); t != term { 474 | // term changed -- can't expect low RPC counts 475 | // need to keep going to update total2 476 | failed = true 477 | } 478 | total2 += cfg.rpcCount(j) 479 | } 480 | 481 | if failed { 482 | continue loop 483 | } 484 | 485 | if total2-total1 > (iters+1+3)*3 { 486 | t.Fatalf("too many RPCs (%v) for %v entries\n", total2-total1, iters) 487 | } 488 | 489 | success = true 490 | break 491 | } 492 | 493 | if !success { 494 | t.Fatalf("term changed too often") 495 | } 496 | 497 | time.Sleep(RaftElectionTimeout) 498 | 499 | total3 := 0 500 | for j := 0; j < servers; j++ { 501 | total3 += cfg.rpcCount(j) 502 | } 503 | 504 | if total3-total2 > 3*20 { 505 | t.Fatalf("too many RPCs (%v) for 1 second of idleness\n", total3-total2) 506 | } 507 | 508 | cfg.end() 509 | } 510 | 511 | func TestPersist12C(t *testing.T) { 512 | servers := 3 513 | cfg := makeConfig(t, servers, false) 514 | defer cfg.cleanup() 515 | 516 | cfg.begin("Test (2C): Persist12C basic persistence") 517 | 518 | cfg.one(11, servers, true) 519 | 520 | // crash and re-start all 521 | for i := 0; i < servers; i++ { 522 | cfg.start1(i) 523 | } 524 | for i := 0; i < servers; i++ { 525 | cfg.disconnect(i) 526 | cfg.connect(i) 527 | } 528 | 529 | cfg.one(12, servers, true) 530 | 531 | leader1 := cfg.checkOneLeader() 532 | cfg.disconnect(leader1) 533 | cfg.start1(leader1) 534 | cfg.connect(leader1) 535 | 536 | cfg.one(13, servers, true) 537 | 538 | leader2 := 
cfg.checkOneLeader() 539 | cfg.disconnect(leader2) 540 | cfg.one(14, servers-1, true) 541 | cfg.start1(leader2) 542 | cfg.connect(leader2) 543 | 544 | cfg.wait(4, servers, -1) // wait for leader2 to join before killing i3 545 | 546 | i3 := (cfg.checkOneLeader() + 1) % servers 547 | cfg.disconnect(i3) 548 | cfg.one(15, servers-1, true) 549 | cfg.start1(i3) 550 | cfg.connect(i3) 551 | 552 | cfg.one(16, servers, true) 553 | 554 | cfg.end() 555 | } 556 | 557 | func TestPersist22C(t *testing.T) { 558 | servers := 5 559 | cfg := makeConfig(t, servers, false) 560 | defer cfg.cleanup() 561 | 562 | cfg.begin("Test (2C): Persist22C more persistence") 563 | 564 | index := 1 565 | for iters := 0; iters < 5; iters++ { 566 | cfg.one(10+index, servers, true) 567 | index++ 568 | 569 | leader1 := cfg.checkOneLeader() 570 | 571 | cfg.disconnect((leader1 + 1) % servers) 572 | cfg.disconnect((leader1 + 2) % servers) 573 | 574 | cfg.one(10+index, servers-2, true) 575 | index++ 576 | 577 | cfg.disconnect((leader1 + 0) % servers) 578 | cfg.disconnect((leader1 + 3) % servers) 579 | cfg.disconnect((leader1 + 4) % servers) 580 | 581 | cfg.start1((leader1 + 1) % servers) 582 | cfg.start1((leader1 + 2) % servers) 583 | cfg.connect((leader1 + 1) % servers) 584 | cfg.connect((leader1 + 2) % servers) 585 | 586 | time.Sleep(RaftElectionTimeout) 587 | 588 | cfg.start1((leader1 + 3) % servers) 589 | cfg.connect((leader1 + 3) % servers) 590 | 591 | cfg.one(10+index, servers-2, true) 592 | index++ 593 | 594 | cfg.connect((leader1 + 4) % servers) 595 | cfg.connect((leader1 + 0) % servers) 596 | } 597 | 598 | cfg.one(1000, servers, true) 599 | 600 | cfg.end() 601 | } 602 | 603 | func TestPersist32C(t *testing.T) { 604 | servers := 3 605 | cfg := makeConfig(t, servers, false) 606 | defer cfg.cleanup() 607 | 608 | cfg.begin("Test (2C): Persist32C partitioned leader and one follower crash, leader restarts") 609 | 610 | cfg.one(101, 3, true) 611 | 612 | leader := cfg.checkOneLeader() 613 | cfg.disconnect((leader + 2) % servers) 614 | 615 | cfg.one(102, 2, true) 616 | 617 | cfg.crash1((leader + 0) % servers) 618 | cfg.crash1((leader + 1) % servers) 619 | cfg.connect((leader + 2) % servers) 620 | cfg.start1((leader + 0) % servers) 621 | cfg.connect((leader + 0) % servers) 622 | 623 | cfg.one(103, 2, true) 624 | 625 | cfg.start1((leader + 1) % servers) 626 | cfg.connect((leader + 1) % servers) 627 | 628 | cfg.one(104, servers, true) 629 | 630 | cfg.end() 631 | } 632 | 633 | // 634 | // Test the scenarios described in Figure 8 of the extended Raft paper. Each 635 | // iteration asks a leader, if there is one, to insert a command in the Raft 636 | // log. If there is a leader, that leader will fail quickly with a high 637 | // probability (perhaps without committing the command), or crash after a while 638 | // with low probability (most likely committing the command). If the number of 639 | // alive servers isn't enough to form a majority, perhaps start a new server. 640 | // The leader in a new term may try to finish replicating log entries that 641 | // haven't been committed yet. 
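// The scenario exercises the commit rule from §5.4.2 of the paper: in this
// implementation broadcastAppendEntries only advances commitIndex by counting
// replicas of entries whose LogTerm equals the leader's currentTerm, so
// entries from earlier terms are only committed indirectly.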
642 | // 643 | func TestFigure82C(t *testing.T) { 644 | servers := 5 645 | cfg := makeConfig(t, servers, false) 646 | defer cfg.cleanup() 647 | 648 | cfg.begin("Test (2C): Figure82C Figure 8") 649 | 650 | cfg.one(rand.Int(), 1, true) 651 | 652 | nup := servers 653 | for iters := 0; iters < 1000; iters++ { 654 | leader := -1 655 | for i := 0; i < servers; i++ { 656 | if cfg.rafts[i] != nil { 657 | _, _, ok := cfg.rafts[i].Start(rand.Int()) 658 | if ok { 659 | leader = i 660 | } 661 | } 662 | } 663 | 664 | if (rand.Int() % 1000) < 100 { 665 | ms := rand.Int63() % (int64(RaftElectionTimeout/time.Millisecond) / 2) 666 | time.Sleep(time.Duration(ms) * time.Millisecond) 667 | } else { 668 | ms := (rand.Int63() % 13) 669 | time.Sleep(time.Duration(ms) * time.Millisecond) 670 | } 671 | 672 | if leader != -1 { 673 | cfg.crash1(leader) 674 | nup-- 675 | } 676 | 677 | if nup < 3 { 678 | s := rand.Int() % servers 679 | if cfg.rafts[s] == nil { 680 | cfg.start1(s) 681 | cfg.connect(s) 682 | nup++ 683 | } 684 | } 685 | } 686 | 687 | for i := 0; i < servers; i++ { 688 | if cfg.rafts[i] == nil { 689 | cfg.start1(i) 690 | cfg.connect(i) 691 | } 692 | } 693 | 694 | cfg.one(rand.Int(), servers, true) 695 | 696 | cfg.end() 697 | } 698 | 699 | func TestUnreliableAgree2C(t *testing.T) { 700 | servers := 5 701 | cfg := makeConfig(t, servers, true) 702 | defer cfg.cleanup() 703 | 704 | cfg.begin("Test (2C): UnreliableAgree2C unreliable agreement") 705 | 706 | var wg sync.WaitGroup 707 | 708 | for iters := 1; iters < 50; iters++ { 709 | for j := 0; j < 4; j++ { 710 | wg.Add(1) 711 | go func(iters, j int) { 712 | defer wg.Done() 713 | cfg.one((100*iters)+j, 1, true) 714 | }(iters, j) 715 | } 716 | cfg.one(iters, 1, true) 717 | } 718 | 719 | cfg.setunreliable(false) 720 | 721 | wg.Wait() 722 | 723 | cfg.one(100, servers, true) 724 | 725 | cfg.end() 726 | } 727 | 728 | func TestFigure8Unreliable2C(t *testing.T) { 729 | servers := 5 730 | cfg := makeConfig(t, servers, true) 731 | defer cfg.cleanup() 732 | 733 | cfg.begin("Test (2C): Figure8Unreliable2C Figure 8 (unreliable)") 734 | 735 | cfg.one(rand.Int()%10000, 1, true) 736 | 737 | nup := servers 738 | for iters := 0; iters < 1000; iters++ { 739 | if iters == 200 { 740 | cfg.setlongreordering(true) 741 | } 742 | leader := -1 743 | for i := 0; i < servers; i++ { 744 | _, _, ok := cfg.rafts[i].Start(rand.Int() % 10000) 745 | if ok && cfg.connected[i] { 746 | leader = i 747 | } 748 | } 749 | 750 | if (rand.Int() % 1000) < 100 { 751 | ms := rand.Int63() % (int64(RaftElectionTimeout/time.Millisecond) / 2) 752 | time.Sleep(time.Duration(ms) * time.Millisecond) 753 | } else { 754 | ms := (rand.Int63() % 13) 755 | time.Sleep(time.Duration(ms) * time.Millisecond) 756 | } 757 | 758 | if leader != -1 && (rand.Int()%1000) < int(RaftElectionTimeout/time.Millisecond)/2 { 759 | cfg.disconnect(leader) 760 | nup-- 761 | } 762 | 763 | if nup < 3 { 764 | s := rand.Int() % servers 765 | if cfg.connected[s] == false { 766 | cfg.connect(s) 767 | nup++ 768 | } 769 | } 770 | } 771 | 772 | for i := 0; i < servers; i++ { 773 | if cfg.connected[i] == false { 774 | cfg.connect(i) 775 | } 776 | } 777 | 778 | cfg.one(rand.Int()%10000, servers, true) 779 | 780 | cfg.end() 781 | } 782 | 783 | func internalChurn(t *testing.T, unreliable bool) { 784 | 785 | servers := 5 786 | cfg := makeConfig(t, servers, unreliable) 787 | defer cfg.cleanup() 788 | 789 | if unreliable { 790 | cfg.begin("Test (2C): unreliable churn") 791 | } else { 792 | cfg.begin("Test (2C): churn") 793 | } 794 | 795 | stop := 
int32(0) 796 | 797 | // create concurrent clients 798 | cfn := func(me int, ch chan []int) { 799 | var ret []int 800 | ret = nil 801 | defer func() { ch <- ret }() 802 | values := []int{} 803 | for atomic.LoadInt32(&stop) == 0 { 804 | x := rand.Int() 805 | index := -1 806 | ok := false 807 | for i := 0; i < servers; i++ { 808 | // try them all, maybe one of them is a leader 809 | cfg.mu.Lock() 810 | rf := cfg.rafts[i] 811 | cfg.mu.Unlock() 812 | if rf != nil { 813 | index1, _, ok1 := rf.Start(x) 814 | if ok1 { 815 | ok = ok1 816 | index = index1 817 | } 818 | } 819 | } 820 | if ok { 821 | // maybe leader will commit our value, maybe not. 822 | // but don't wait forever. 823 | for _, to := range []int{10, 20, 50, 100, 200} { 824 | nd, cmd := cfg.nCommitted(index) 825 | if nd > 0 { 826 | if xx, ok := cmd.(int); ok { 827 | if xx == x { 828 | values = append(values, x) 829 | } 830 | } else { 831 | cfg.t.Fatalf("wrong command type") 832 | } 833 | break 834 | } 835 | time.Sleep(time.Duration(to) * time.Millisecond) 836 | } 837 | } else { 838 | time.Sleep(time.Duration(79+me*17) * time.Millisecond) 839 | } 840 | } 841 | ret = values 842 | } 843 | 844 | ncli := 3 845 | cha := []chan []int{} 846 | for i := 0; i < ncli; i++ { 847 | cha = append(cha, make(chan []int)) 848 | go cfn(i, cha[i]) 849 | } 850 | 851 | for iters := 0; iters < 20; iters++ { 852 | if (rand.Int() % 1000) < 200 { 853 | i := rand.Int() % servers 854 | cfg.disconnect(i) 855 | } 856 | 857 | if (rand.Int() % 1000) < 500 { 858 | i := rand.Int() % servers 859 | if cfg.rafts[i] == nil { 860 | cfg.start1(i) 861 | } 862 | cfg.connect(i) 863 | } 864 | 865 | if (rand.Int() % 1000) < 200 { 866 | i := rand.Int() % servers 867 | if cfg.rafts[i] != nil { 868 | cfg.crash1(i) 869 | } 870 | } 871 | 872 | // Make crash/restart infrequent enough that the peers can often 873 | // keep up, but not so infrequent that everything has settled 874 | // down from one change to the next. Pick a value smaller than 875 | // the election timeout, but not hugely smaller. 876 | time.Sleep((RaftElectionTimeout * 7) / 10) 877 | } 878 | 879 | time.Sleep(RaftElectionTimeout) 880 | cfg.setunreliable(false) 881 | for i := 0; i < servers; i++ { 882 | if cfg.rafts[i] == nil { 883 | cfg.start1(i) 884 | } 885 | cfg.connect(i) 886 | } 887 | 888 | atomic.StoreInt32(&stop, 1) 889 | 890 | values := []int{} 891 | for i := 0; i < ncli; i++ { 892 | vv := <-cha[i] 893 | if vv == nil { 894 | t.Fatal("client failed") 895 | } 896 | values = append(values, vv...) 
897 | } 898 | 899 | time.Sleep(RaftElectionTimeout) 900 | 901 | lastIndex := cfg.one(rand.Int(), servers, true) 902 | 903 | really := make([]int, lastIndex+1) 904 | for index := 1; index <= lastIndex; index++ { 905 | v := cfg.wait(index, servers, -1) 906 | if vi, ok := v.(int); ok { 907 | really = append(really, vi) 908 | } else { 909 | t.Fatalf("not an int") 910 | } 911 | } 912 | 913 | for _, v1 := range values { 914 | ok := false 915 | for _, v2 := range really { 916 | if v1 == v2 { 917 | ok = true 918 | } 919 | } 920 | if ok == false { 921 | cfg.t.Fatalf("didn't find a value") 922 | } 923 | } 924 | 925 | cfg.end() 926 | } 927 | 928 | func TestReliableChurn2C(t *testing.T) { 929 | internalChurn(t, false) 930 | } 931 | 932 | func TestUnreliableChurn2C(t *testing.T) { 933 | internalChurn(t, true) 934 | } 935 | -------------------------------------------------------------------------------- /Raft/code/util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "log" 4 | 5 | // Debugging is 6 | const Debugging = 0 7 | 8 | // DPrintf is 9 | func DPrintf(format string, a ...interface{}) { 10 | if Debugging > 0 { 11 | log.Printf(format, a...) 12 | } 13 | } 14 | 15 | func min(a, b int) int { 16 | if a < b { 17 | return a 18 | } 19 | return b 20 | } 21 | -------------------------------------------------------------------------------- /Raft/raft-extended.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aQuaYi/Distributed-Algorithms/8c27e1220fb1c467da999f30244d40f520365522/Raft/raft-extended.pdf -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | echo "" > coverage.txt 5 | 6 | for d in $(go list ./... | grep -v vendor); do 7 | echo $d 8 | go test -coverprofile=profile.out -covermode=atomic $d 9 | if [ -f profile.out ]; then 10 | cat profile.out >> coverage.txt 11 | rm profile.out 12 | fi 13 | done --------------------------------------------------------------------------------