├── .gitignore
├── MAPREDUCE.md
├── Makefile
├── RAFT.md
├── README.md
├── resource
│   └── mr.png
└── src
    ├── .gitignore
    ├── kvraft
    │   ├── client.go
    │   ├── common.go
    │   ├── config.go
    │   ├── server.go
    │   └── test_test.go
    ├── labgob
    │   ├── labgob.go
    │   └── test_test.go
    ├── labrpc
    │   ├── labrpc.go
    │   └── test_test.go
    ├── linearizability
    │   ├── bitset.go
    │   ├── linearizability.go
    │   ├── model.go
    │   └── models.go
    ├── main
    │   ├── diskvd.go
    │   ├── lockc.go
    │   ├── lockd.go
    │   ├── mrmaster.go
    │   ├── mrsequential.go
    │   ├── mrworker.go
    │   ├── pbc.go
    │   ├── pbd.go
    │   ├── pg-being_ernest.txt
    │   ├── pg-dorian_gray.txt
    │   ├── pg-frankenstein.txt
    │   ├── pg-grimm.txt
    │   ├── pg-huckleberry_finn.txt
    │   ├── pg-metamorphosis.txt
    │   ├── pg-sherlock_holmes.txt
    │   ├── pg-tom_sawyer.txt
    │   ├── test-mr.sh
    │   └── viewd.go
    ├── mr
    │   ├── master.go
    │   ├── rpc.go
    │   └── worker.go
    ├── mrapps
    │   ├── crash.go
    │   ├── indexer.go
    │   ├── mtiming.go
    │   ├── nocrash.go
    │   ├── rtiming.go
    │   └── wc.go
    ├── mroriginal
    │   ├── master.go
    │   ├── rpc.go
    │   └── worker.go
    ├── raft
    │   ├── config.go
    │   ├── persister.go
    │   ├── raft.go
    │   ├── test_test.go
    │   └── util.go
    ├── shardkv
    │   ├── client.go
    │   ├── common.go
    │   ├── config.go
    │   ├── server.go
    │   └── test_test.go
    └── shardmaster
        ├── client.go
        ├── common.go
        ├── config.go
        ├── server.go
        └── test_test.go

/.gitignore:
--------------------------------------------------------------------------------
1 | pkg/
2 | api.key
3 | *-handin.tar.gz
4 | .idea
5 | src/main/test-mr.sh
--------------------------------------------------------------------------------
/MAPREDUCE.md:
--------------------------------------------------------------------------------
1 | # Lab 1: Implement a MapReduce Framework
2 | ## 0. Run a sequential **mapreduce** example to get a feel for the program logic
3 | ```bash
4 | cd src/main
5 | # build the user-defined map/reduce functions as a plugin (dynamically loaded library)
6 | go build -buildmode=plugin ../mrapps/wc.go
7 | rm mr-out*
8 | go run mrsequential.go wc.so pg*.txt
9 | more mr-out-0
10 | ```
11 | ## 1. How to start a distributed **mapreduce**?
12 | ### Start a master
13 | ```bash
14 | cd src/main
15 | go build -buildmode=plugin ../mrapps/wc.go
16 | rm mr-out*
17 | go run mrmaster.go pg-*.txt
18 | ```
19 | ### Start several workers
20 | Open several terminals, change into the project's `main` directory, and run `go run mrworker.go wc.so` in each.
21 | 
22 | ## 2. Complete the lab
23 | ### Coding
24 | 1. Write your implementation in the `src/mr` directory.
25 | 2. `mrmaster.go` calls the code you write in `src/mr/master.go`.
26 | 3. `mrworker.go` calls the code in `src/mr/worker.go`.
27 | 4. The RPC that connects the two is yours to implement in `src/mr/rpc.go` (a hedged sketch of one possible definition follows the README section below).
28 | 5. I have already completed the lab code; the original lab skeleton is kept in `src/mroriginal` and is left for you to complete independently.
29 | 6. Just overwrite the mr folder with it, `mv src/mroriginal src/mr`, or back mine up first as a reference 😁
30 | 
31 | **ps: the files under `src/mrapps` are the MapReduce (MR) application functions, i.e. the map/reduce plugins.**
32 | 
33 | ### Debugging
34 | Comparing against the output of `cat mr-out-* | sort` lets you check whether the MR framework you wrote runs correctly.
35 | 
36 | ### Testing
37 | Run `sh test-mr.sh` in the `src/main` directory. Because the master process never exits by default, remember to run `killall mr*` after the script finishes to release the resources.
38 | 
39 | ## More on Lab 1
40 | [Click here](http://nil.csail.mit.edu/6.824/2020/labs/lab-mr.html)
41 | 
42 | ## Passing screenshot
43 | ![mr](resource/mr.png)
44 | 
45 | ## Takeaways
46 | 1. All tests pass, but many parts still feel unfinished.
47 | 2. I did not think resource management through at design time, which made it hard to manage later.
48 | 3. One engineering principle learned: settling the essential problems at design time is the key to productive output.
49 | 
50 | ## TODO
51 | - [ ] Release resources properly after a job completes
52 | 
53 | - [ ] Handle abnormal communication states between worker and master
54 | 
55 | - [ ] Shut the whole cluster down gracefully
56 | - [ ] Release the resources a job depends on when the cluster is shut down
57 | - [ ] Reject worker registrations and task requests once the job is finished
58 | 
59 | - [ ] Grow into a production-grade embeddable MapReduce framework
60 | - [ ] Let the master run multiple different jobs and their workers at the same time
61 | - [ ] Support more robust data transfer over an asynchronous network
62 | 
63 | - [ ] Guarantee that Map task commits are atomic (when the map intermediate files are produced, neither the commit message nor the file creation is currently idempotent)
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # This is the Makefile helping you submit the labs.
2 | # Just create 6.824/api.key with your API key in it,
3 | # and submit your lab with the following command:
4 | #     $ make [lab1|lab2a|lab2b|lab2c|lab3a|lab3b|lab4a|lab4b]
5 | 
6 | LABS=" lab1 lab2a lab2b lab2c lab3a lab3b lab4a lab4b "
7 | 
8 | %:
9 | 	@echo "Preparing $@-handin.tar.gz"
10 | 	@echo "Checking for committed temporary files..."
11 | 	@if git ls-files | grep -E 'mr-tmp|mrinput' > /dev/null; then \
12 | 		echo "" ; \
13 | 		echo "OBS! You have committed some large temporary files:" ; \
14 | 		echo "" ; \
15 | 		git ls-files | grep -E 'mr-tmp|mrinput' | sed 's/^/\t/' ; \
16 | 		echo "" ; \
17 | 		echo "Follow the instructions at http://stackoverflow.com/a/308684/472927" ; \
18 | 		echo "to remove them, and then run make again." ; \
19 | 		echo "" ; \
20 | 		exit 1 ; \
21 | 	fi
22 | 	@if echo $(LABS) | grep -q " $@ " ; then \
23 | 		echo "Tarring up your submission..." ; \
24 | 		tar cvzf $@-handin.tar.gz \
25 | 			"--exclude=src/main/pg-*.txt" \
26 | 			"--exclude=src/main/diskvd" \
27 | 			"--exclude=src/mapreduce/824-mrinput-*.txt" \
28 | 			"--exclude=src/main/mr-*" \
29 | 			"--exclude=mrtmp.*" \
30 | 			"--exclude=src/main/diff.out" \
31 | 			Makefile src; \
32 | 		if ! test -e api.key ; then \
33 | 			echo "Missing $(PWD)/api.key. Please create the file with your key in it or submit the $@-handin.tar.gz via the web interface."; \
34 | 		else \
35 | 			echo "Are you sure you want to submit $@? Enter 'yes' to continue:"; \
36 | 			read line; \
37 | 			if test "$$line" != "yes" ; then echo "Giving up submission"; exit; fi; \
38 | 			if test `stat -c "%s" "$@-handin.tar.gz" 2>/dev/null || stat -f "%z" "$@-handin.tar.gz"` -ge 20971520 ; then echo "File exceeds 20MB."; exit; fi; \
39 | 			mv api.key api.key.fix ; \
40 | 			cat api.key.fix | tr -d '\n' > api.key ; \
41 | 			rm api.key.fix ; \
42 | 			curl -F file=@$@-handin.tar.gz -F "key= /dev/null || { \
43 | 				echo ; \
44 | 				echo "Submit seems to have failed."; \
45 | 				echo "Please upload the tarball manually on the submission website."; } \
46 | 		fi; \
47 | 	else \
48 | 		echo "Bad target $@. Usage: make [$(LABS)]"; \
49 | 	fi
50 | 
--------------------------------------------------------------------------------
/RAFT.md:
--------------------------------------------------------------------------------
1 | ## Lab 2A: Implement Raft election and heartbeats
2 | Took 13 hours in total (one hour of lab time per day; the scattered focus really did slow things down).
3 | Most of the time was lost to `not analysing the problems clearly enough`. The main issues were:
4 | 1. The ratio between the heartbeat interval and the election timeout was poorly tuned (a hedged timeout sketch follows the README section below).
5 | 2. Cluster role transitions were not handled well; after some key actions the role was not set correctly.
6 | 3. I was overconfident. Comments should be written before code, and code should only be written once the whole logic reads smoothly; that is what keeps bugs down and efficiency up. From now on I will write the complete logic out as comments in one sitting before developing.
7 | 
8 | ### Things that still feel unfinished
9 | TODO:
10 | - [ ] In the election/heartbeat paths, only atomic locking of Raft's state protects concurrent updates; there is no transactional guarantee across multiple pieces of state, which is a latent risk (and probably why a test occasionally fails).
11 | - [ ] Tests still fail once in a while; for now I put it down to timing variability and will fix it when a later lab surfaces the problem.
12 | 
13 | ### Final summary
14 | 1. The initial design should consider only what is necessary; reducing complexity is what lets you focus on the important things.
15 | 2. The way to keep the logic coherent is to write the comments for the key logic first, in one sitting, reasoning in pseudocode away from the code; most problems are logic problems.
16 | 3. The problems are usually simple; I just had not followed a disciplined way of thinking about them.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | A record of completing labs 1-4 of the MIT 6.824 course (Spring 2020). This project only moves some of the lab packages around so that cloning it under `$GOPATH/src` runs the labs correctly.
3 | # Value
4 | This repository aims to help students taking 6.824 set up the lab environment quickly, and it provides my completed solutions as a reference. PRs that keep the repo current with different offerings of 6.824 are welcome, to make it easier for Chinese students to learn distributed systems.
5 | # Disclaimer
6 | For personal study only. You are strongly encouraged to complete the labs yourself so you learn more; the answers are meant to help you form your own approach, not to make copying possible!!!
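The following sketch relates to step 4 of the Coding list in MAPREDUCE.md above: the worker/master RPC that `src/mr/rpc.go` leaves for you to define. It is a minimal illustration under assumed names; `TaskKind`, `TaskRequest`, `TaskReply`, and the field names are hypothetical and are not taken from this repository's solution.

```go
package mr

// One possible shape for the worker<->master protocol: a worker asks for work
// and the master replies with a map task, a reduce task, or an instruction to
// wait or exit. All identifiers here are illustrative assumptions.

type TaskKind int

const (
	MapTask TaskKind = iota
	ReduceTask
	WaitTask // nothing to hand out right now; sleep and ask again
	ExitTask // the job is finished; the worker may terminate
)

type TaskRequest struct {
	WorkerId int // identifies the worker asking for a task
}

type TaskReply struct {
	Kind    TaskKind
	TaskId  int
	File    string // input file for a map task
	NReduce int    // number of reduce partitions, used to name mr-X-Y files
	NMap    int    // number of map tasks, so a reducer knows which files to read
}
```

A worker would then loop on something like `call("Master.AssignTask", &TaskRequest{...}, &reply)` (the handler name is again an assumption) until it receives `ExitTask`.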
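The next sketch relates to point 1 of Lab 2A in RAFT.md above, the heartbeat/election-timeout ratio. It only illustrates the usual rule of thumb that the randomized election timeout should span several heartbeat intervals; the constants and the helper name are assumptions, not values read out of this repository's `raft.go`.

```go
package raft

import (
	"math/rand"
	"time"
)

// Illustrative values only: heartbeats must arrive well inside the election
// timeout, and the timeout is randomized so that followers do not all start
// elections at the same moment.
const heartbeatInterval = 100 * time.Millisecond

// randomElectionTimeout draws a fresh timeout in the 300-600ms range, i.e.
// several heartbeat intervals, each time a follower resets its election timer.
func randomElectionTimeout() time.Duration {
	return 300*time.Millisecond + time.Duration(rand.Int63n(300))*time.Millisecond
}
```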
7 | -------------------------------------------------------------------------------- /resource/mr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logikoisto/Mit6824/820d278e1ff0dd86f74bd3929550c7d23291f6b2/resource/mr.png -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.*/ 2 | mrtmp.* 3 | 824-mrinput-*.txt 4 | /main/diff.out 5 | /mapreduce/x.txt 6 | /pbservice/x.txt 7 | /kvpaxos/x.txt 8 | /main/wc.so 9 | /main/mr-out-0 10 | -------------------------------------------------------------------------------- /src/kvraft/client.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "labrpc" 4 | import "crypto/rand" 5 | import "math/big" 6 | 7 | type Clerk struct { 8 | servers []*labrpc.ClientEnd 9 | // You will have to modify this struct. 10 | } 11 | 12 | func nrand() int64 { 13 | max := big.NewInt(int64(1) << 62) 14 | bigx, _ := rand.Int(rand.Reader, max) 15 | x := bigx.Int64() 16 | return x 17 | } 18 | 19 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 20 | ck := new(Clerk) 21 | ck.servers = servers 22 | // You'll have to add code here. 23 | return ck 24 | } 25 | 26 | // 27 | // fetch the current value for a key. 28 | // returns "" if the key does not exist. 29 | // keeps trying forever in the face of all other errors. 30 | // 31 | // you can send an RPC with code like this: 32 | // ok := ck.servers[i].Call("KVServer.Get", &args, &reply) 33 | // 34 | // the types of args and reply (including whether they are pointers) 35 | // must match the declared types of the RPC handler function's 36 | // arguments. and reply must be passed as a pointer. 37 | // 38 | func (ck *Clerk) Get(key string) string { 39 | 40 | // You will have to modify this function. 41 | return "" 42 | } 43 | 44 | // 45 | // shared by Put and Append. 46 | // 47 | // you can send an RPC with code like this: 48 | // ok := ck.servers[i].Call("KVServer.PutAppend", &args, &reply) 49 | // 50 | // the types of args and reply (including whether they are pointers) 51 | // must match the declared types of the RPC handler function's 52 | // arguments. and reply must be passed as a pointer. 53 | // 54 | func (ck *Clerk) PutAppend(key string, value string, op string) { 55 | // You will have to modify this function. 56 | } 57 | 58 | func (ck *Clerk) Put(key string, value string) { 59 | ck.PutAppend(key, value, "Put") 60 | } 61 | func (ck *Clerk) Append(key string, value string) { 62 | ck.PutAppend(key, value, "Append") 63 | } 64 | -------------------------------------------------------------------------------- /src/kvraft/common.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | const ( 4 | OK = "OK" 5 | ErrNoKey = "ErrNoKey" 6 | ErrWrongLeader = "ErrWrongLeader" 7 | ) 8 | 9 | type Err string 10 | 11 | // Put or Append 12 | type PutAppendArgs struct { 13 | Key string 14 | Value string 15 | Op string // "Put" or "Append" 16 | // You'll have to add definitions here. 17 | // Field names must start with capital letters, 18 | // otherwise RPC will break. 19 | } 20 | 21 | type PutAppendReply struct { 22 | Err Err 23 | } 24 | 25 | type GetArgs struct { 26 | Key string 27 | // You'll have to add definitions here. 
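// (suggestion, not part of the original skeleton) many solutions add something
// like a client id and a per-client sequence number here, mirroring whatever
// is added to PutAppendArgs above, so the server can detect duplicate
// requests; the exact fields and names are yours to choose.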
28 | } 29 | 30 | type GetReply struct { 31 | Err Err 32 | Value string 33 | } 34 | -------------------------------------------------------------------------------- /src/kvraft/config.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "labrpc" 4 | import "testing" 5 | import "os" 6 | 7 | // import "log" 8 | import crand "crypto/rand" 9 | import "math/big" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "raft" 15 | import "fmt" 16 | import "time" 17 | import "sync/atomic" 18 | 19 | func randstring(n int) string { 20 | b := make([]byte, 2*n) 21 | crand.Read(b) 22 | s := base64.URLEncoding.EncodeToString(b) 23 | return s[0:n] 24 | } 25 | 26 | func makeSeed() int64 { 27 | max := big.NewInt(int64(1) << 62) 28 | bigx, _ := crand.Int(crand.Reader, max) 29 | x := bigx.Int64() 30 | return x 31 | } 32 | 33 | // Randomize server handles 34 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 35 | sa := make([]*labrpc.ClientEnd, len(kvh)) 36 | copy(sa, kvh) 37 | for i := range sa { 38 | j := rand.Intn(i + 1) 39 | sa[i], sa[j] = sa[j], sa[i] 40 | } 41 | return sa 42 | } 43 | 44 | type config struct { 45 | mu sync.Mutex 46 | t *testing.T 47 | net *labrpc.Network 48 | n int 49 | kvservers []*KVServer 50 | saved []*raft.Persister 51 | endnames [][]string // names of each server's sending ClientEnds 52 | clerks map[*Clerk][]string 53 | nextClientId int 54 | maxraftstate int 55 | start time.Time // time at which make_config() was called 56 | // begin()/end() statistics 57 | t0 time.Time // time at which test_test.go called cfg.begin() 58 | rpcs0 int // rpcTotal() at start of test 59 | ops int32 // number of clerk get/put/append method calls 60 | } 61 | 62 | func (cfg *config) checkTimeout() { 63 | // enforce a two minute real-time limit on each test 64 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 65 | cfg.t.Fatal("test took longer than 120 seconds") 66 | } 67 | } 68 | 69 | func (cfg *config) cleanup() { 70 | cfg.mu.Lock() 71 | defer cfg.mu.Unlock() 72 | for i := 0; i < len(cfg.kvservers); i++ { 73 | if cfg.kvservers[i] != nil { 74 | cfg.kvservers[i].Kill() 75 | } 76 | } 77 | cfg.net.Cleanup() 78 | cfg.checkTimeout() 79 | } 80 | 81 | // Maximum log size across all servers 82 | func (cfg *config) LogSize() int { 83 | logsize := 0 84 | for i := 0; i < cfg.n; i++ { 85 | n := cfg.saved[i].RaftStateSize() 86 | if n > logsize { 87 | logsize = n 88 | } 89 | } 90 | return logsize 91 | } 92 | 93 | // Maximum snapshot size across all servers 94 | func (cfg *config) SnapshotSize() int { 95 | snapshotsize := 0 96 | for i := 0; i < cfg.n; i++ { 97 | n := cfg.saved[i].SnapshotSize() 98 | if n > snapshotsize { 99 | snapshotsize = n 100 | } 101 | } 102 | return snapshotsize 103 | } 104 | 105 | // attach server i to servers listed in to 106 | // caller must hold cfg.mu 107 | func (cfg *config) connectUnlocked(i int, to []int) { 108 | // log.Printf("connect peer %d to %v\n", i, to) 109 | 110 | // outgoing socket files 111 | for j := 0; j < len(to); j++ { 112 | endname := cfg.endnames[i][to[j]] 113 | cfg.net.Enable(endname, true) 114 | } 115 | 116 | // incoming socket files 117 | for j := 0; j < len(to); j++ { 118 | endname := cfg.endnames[to[j]][i] 119 | cfg.net.Enable(endname, true) 120 | } 121 | } 122 | 123 | func (cfg *config) connect(i int, to []int) { 124 | cfg.mu.Lock() 125 | defer cfg.mu.Unlock() 126 | cfg.connectUnlocked(i, to) 127 | } 128 | 129 | // detach 
server i from the servers listed in from 130 | // caller must hold cfg.mu 131 | func (cfg *config) disconnectUnlocked(i int, from []int) { 132 | // log.Printf("disconnect peer %d from %v\n", i, from) 133 | 134 | // outgoing socket files 135 | for j := 0; j < len(from); j++ { 136 | if cfg.endnames[i] != nil { 137 | endname := cfg.endnames[i][from[j]] 138 | cfg.net.Enable(endname, false) 139 | } 140 | } 141 | 142 | // incoming socket files 143 | for j := 0; j < len(from); j++ { 144 | if cfg.endnames[j] != nil { 145 | endname := cfg.endnames[from[j]][i] 146 | cfg.net.Enable(endname, false) 147 | } 148 | } 149 | } 150 | 151 | func (cfg *config) disconnect(i int, from []int) { 152 | cfg.mu.Lock() 153 | defer cfg.mu.Unlock() 154 | cfg.disconnectUnlocked(i, from) 155 | } 156 | 157 | func (cfg *config) All() []int { 158 | all := make([]int, cfg.n) 159 | for i := 0; i < cfg.n; i++ { 160 | all[i] = i 161 | } 162 | return all 163 | } 164 | 165 | func (cfg *config) ConnectAll() { 166 | cfg.mu.Lock() 167 | defer cfg.mu.Unlock() 168 | for i := 0; i < cfg.n; i++ { 169 | cfg.connectUnlocked(i, cfg.All()) 170 | } 171 | } 172 | 173 | // Sets up 2 partitions with connectivity between servers in each partition. 174 | func (cfg *config) partition(p1 []int, p2 []int) { 175 | cfg.mu.Lock() 176 | defer cfg.mu.Unlock() 177 | // log.Printf("partition servers into: %v %v\n", p1, p2) 178 | for i := 0; i < len(p1); i++ { 179 | cfg.disconnectUnlocked(p1[i], p2) 180 | cfg.connectUnlocked(p1[i], p1) 181 | } 182 | for i := 0; i < len(p2); i++ { 183 | cfg.disconnectUnlocked(p2[i], p1) 184 | cfg.connectUnlocked(p2[i], p2) 185 | } 186 | } 187 | 188 | // Create a clerk with clerk specific server names. 189 | // Give it connections to all of the servers, but for 190 | // now enable only connections to servers in to[]. 191 | func (cfg *config) makeClient(to []int) *Clerk { 192 | cfg.mu.Lock() 193 | defer cfg.mu.Unlock() 194 | 195 | // a fresh set of ClientEnds. 
196 | ends := make([]*labrpc.ClientEnd, cfg.n) 197 | endnames := make([]string, cfg.n) 198 | for j := 0; j < cfg.n; j++ { 199 | endnames[j] = randstring(20) 200 | ends[j] = cfg.net.MakeEnd(endnames[j]) 201 | cfg.net.Connect(endnames[j], j) 202 | } 203 | 204 | ck := MakeClerk(random_handles(ends)) 205 | cfg.clerks[ck] = endnames 206 | cfg.nextClientId++ 207 | cfg.ConnectClientUnlocked(ck, to) 208 | return ck 209 | } 210 | 211 | func (cfg *config) deleteClient(ck *Clerk) { 212 | cfg.mu.Lock() 213 | defer cfg.mu.Unlock() 214 | 215 | v := cfg.clerks[ck] 216 | for i := 0; i < len(v); i++ { 217 | os.Remove(v[i]) 218 | } 219 | delete(cfg.clerks, ck) 220 | } 221 | 222 | // caller should hold cfg.mu 223 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 224 | // log.Printf("ConnectClient %v to %v\n", ck, to) 225 | endnames := cfg.clerks[ck] 226 | for j := 0; j < len(to); j++ { 227 | s := endnames[to[j]] 228 | cfg.net.Enable(s, true) 229 | } 230 | } 231 | 232 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 233 | cfg.mu.Lock() 234 | defer cfg.mu.Unlock() 235 | cfg.ConnectClientUnlocked(ck, to) 236 | } 237 | 238 | // caller should hold cfg.mu 239 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 240 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 241 | endnames := cfg.clerks[ck] 242 | for j := 0; j < len(from); j++ { 243 | s := endnames[from[j]] 244 | cfg.net.Enable(s, false) 245 | } 246 | } 247 | 248 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 249 | cfg.mu.Lock() 250 | defer cfg.mu.Unlock() 251 | cfg.DisconnectClientUnlocked(ck, from) 252 | } 253 | 254 | // Shutdown a server by isolating it 255 | func (cfg *config) ShutdownServer(i int) { 256 | cfg.mu.Lock() 257 | defer cfg.mu.Unlock() 258 | 259 | cfg.disconnectUnlocked(i, cfg.All()) 260 | 261 | // disable client connections to the server. 262 | // it's important to do this before creating 263 | // the new Persister in saved[i], to avoid 264 | // the possibility of the server returning a 265 | // positive reply to an Append but persisting 266 | // the result in the superseded Persister. 267 | cfg.net.DeleteServer(i) 268 | 269 | // a fresh persister, in case old instance 270 | // continues to update the Persister. 271 | // but copy old persister's content so that we always 272 | // pass Make() the last persisted state. 273 | if cfg.saved[i] != nil { 274 | cfg.saved[i] = cfg.saved[i].Copy() 275 | } 276 | 277 | kv := cfg.kvservers[i] 278 | if kv != nil { 279 | cfg.mu.Unlock() 280 | kv.Kill() 281 | cfg.mu.Lock() 282 | cfg.kvservers[i] = nil 283 | } 284 | } 285 | 286 | // If restart servers, first call ShutdownServer 287 | func (cfg *config) StartServer(i int) { 288 | cfg.mu.Lock() 289 | 290 | // a fresh set of outgoing ClientEnd names. 291 | cfg.endnames[i] = make([]string, cfg.n) 292 | for j := 0; j < cfg.n; j++ { 293 | cfg.endnames[i][j] = randstring(20) 294 | } 295 | 296 | // a fresh set of ClientEnds. 297 | ends := make([]*labrpc.ClientEnd, cfg.n) 298 | for j := 0; j < cfg.n; j++ { 299 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 300 | cfg.net.Connect(cfg.endnames[i][j], j) 301 | } 302 | 303 | // a fresh persister, so old instance doesn't overwrite 304 | // new instance's persisted state. 305 | // give the fresh persister a copy of the old persister's 306 | // state, so that the spec is that we pass StartKVServer() 307 | // the last persisted state. 
308 | if cfg.saved[i] != nil { 309 | cfg.saved[i] = cfg.saved[i].Copy() 310 | } else { 311 | cfg.saved[i] = raft.MakePersister() 312 | } 313 | cfg.mu.Unlock() 314 | 315 | cfg.kvservers[i] = StartKVServer(ends, i, cfg.saved[i], cfg.maxraftstate) 316 | 317 | kvsvc := labrpc.MakeService(cfg.kvservers[i]) 318 | rfsvc := labrpc.MakeService(cfg.kvservers[i].rf) 319 | srv := labrpc.MakeServer() 320 | srv.AddService(kvsvc) 321 | srv.AddService(rfsvc) 322 | cfg.net.AddServer(i, srv) 323 | } 324 | 325 | func (cfg *config) Leader() (bool, int) { 326 | cfg.mu.Lock() 327 | defer cfg.mu.Unlock() 328 | 329 | for i := 0; i < cfg.n; i++ { 330 | _, is_leader := cfg.kvservers[i].rf.GetState() 331 | if is_leader { 332 | return true, i 333 | } 334 | } 335 | return false, 0 336 | } 337 | 338 | // Partition servers into 2 groups and put current leader in minority 339 | func (cfg *config) make_partition() ([]int, []int) { 340 | _, l := cfg.Leader() 341 | p1 := make([]int, cfg.n/2+1) 342 | p2 := make([]int, cfg.n/2) 343 | j := 0 344 | for i := 0; i < cfg.n; i++ { 345 | if i != l { 346 | if j < len(p1) { 347 | p1[j] = i 348 | } else { 349 | p2[j-len(p1)] = i 350 | } 351 | j++ 352 | } 353 | } 354 | p2[len(p2)-1] = l 355 | return p1, p2 356 | } 357 | 358 | var ncpu_once sync.Once 359 | 360 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 361 | ncpu_once.Do(func() { 362 | if runtime.NumCPU() < 2 { 363 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 364 | } 365 | rand.Seed(makeSeed()) 366 | }) 367 | runtime.GOMAXPROCS(4) 368 | cfg := &config{} 369 | cfg.t = t 370 | cfg.net = labrpc.MakeNetwork() 371 | cfg.n = n 372 | cfg.kvservers = make([]*KVServer, cfg.n) 373 | cfg.saved = make([]*raft.Persister, cfg.n) 374 | cfg.endnames = make([][]string, cfg.n) 375 | cfg.clerks = make(map[*Clerk][]string) 376 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 377 | cfg.maxraftstate = maxraftstate 378 | cfg.start = time.Now() 379 | 380 | // create a full set of KV servers. 381 | for i := 0; i < cfg.n; i++ { 382 | cfg.StartServer(i) 383 | } 384 | 385 | cfg.ConnectAll() 386 | 387 | cfg.net.Reliable(!unreliable) 388 | 389 | return cfg 390 | } 391 | 392 | func (cfg *config) rpcTotal() int { 393 | return cfg.net.GetTotalCount() 394 | } 395 | 396 | // start a Test. 397 | // print the Test message. 398 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 399 | func (cfg *config) begin(description string) { 400 | fmt.Printf("%s ...\n", description) 401 | cfg.t0 = time.Now() 402 | cfg.rpcs0 = cfg.rpcTotal() 403 | atomic.StoreInt32(&cfg.ops, 0) 404 | } 405 | 406 | func (cfg *config) op() { 407 | atomic.AddInt32(&cfg.ops, 1) 408 | } 409 | 410 | // end a Test -- the fact that we got here means there 411 | // was no failure. 412 | // print the Passed message, 413 | // and some performance numbers. 414 | func (cfg *config) end() { 415 | cfg.checkTimeout() 416 | if cfg.t.Failed() == false { 417 | t := time.Since(cfg.t0).Seconds() // real time 418 | npeers := cfg.n // number of Raft peers 419 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 420 | ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls 421 | 422 | fmt.Printf(" ... 
Passed --") 423 | fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /src/kvraft/server.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "labgob" 5 | "labrpc" 6 | "log" 7 | "raft" 8 | "sync" 9 | "sync/atomic" 10 | ) 11 | 12 | const Debug = 0 13 | 14 | func DPrintf(format string, a ...interface{}) (n int, err error) { 15 | if Debug > 0 { 16 | log.Printf(format, a...) 17 | } 18 | return 19 | } 20 | 21 | type Op struct { 22 | // Your definitions here. 23 | // Field names must start with capital letters, 24 | // otherwise RPC will break. 25 | } 26 | 27 | type KVServer struct { 28 | mu sync.Mutex 29 | me int 30 | rf *raft.Raft 31 | applyCh chan raft.ApplyMsg 32 | dead int32 // set by Kill() 33 | 34 | maxraftstate int // snapshot if log grows this big 35 | 36 | // Your definitions here. 37 | } 38 | 39 | func (kv *KVServer) Get(args *GetArgs, reply *GetReply) { 40 | // Your code here. 41 | } 42 | 43 | func (kv *KVServer) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 44 | // Your code here. 45 | } 46 | 47 | // 48 | // the tester calls Kill() when a KVServer instance won't 49 | // be needed again. for your convenience, we supply 50 | // code to set rf.dead (without needing a lock), 51 | // and a killed() method to test rf.dead in 52 | // long-running loops. you can also add your own 53 | // code to Kill(). you're not required to do anything 54 | // about this, but it may be convenient (for example) 55 | // to suppress debug output from a Kill()ed instance. 56 | // 57 | func (kv *KVServer) Kill() { 58 | atomic.StoreInt32(&kv.dead, 1) 59 | kv.rf.Kill() 60 | // Your code here, if desired. 61 | } 62 | 63 | func (kv *KVServer) killed() bool { 64 | z := atomic.LoadInt32(&kv.dead) 65 | return z == 1 66 | } 67 | 68 | // 69 | // servers[] contains the ports of the set of 70 | // servers that will cooperate via Raft to 71 | // form the fault-tolerant key/value service. 72 | // me is the index of the current server in servers[]. 73 | // the k/v server should store snapshots through the underlying Raft 74 | // implementation, which should call persister.SaveStateAndSnapshot() to 75 | // atomically save the Raft state along with the snapshot. 76 | // the k/v server should snapshot when Raft's saved state exceeds maxraftstate bytes, 77 | // in order to allow Raft to garbage-collect its log. if maxraftstate is -1, 78 | // you don't need to snapshot. 79 | // StartKVServer() must return quickly, so it should start goroutines 80 | // for any long-running work. 81 | // 82 | func StartKVServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int) *KVServer { 83 | // call labgob.Register on structures you want 84 | // Go's RPC library to marshall/unmarshall. 85 | labgob.Register(Op{}) 86 | 87 | kv := new(KVServer) 88 | kv.me = me 89 | kv.maxraftstate = maxraftstate 90 | 91 | // You may need initialization code here. 92 | 93 | kv.applyCh = make(chan raft.ApplyMsg) 94 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 95 | 96 | // You may need initialization code here. 
97 | 98 | return kv 99 | } 100 | -------------------------------------------------------------------------------- /src/labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | type LabEncoder struct { 23 | gob *gob.Encoder 24 | } 25 | 26 | func NewEncoder(w io.Writer) *LabEncoder { 27 | enc := &LabEncoder{} 28 | enc.gob = gob.NewEncoder(w) 29 | return enc 30 | } 31 | 32 | func (enc *LabEncoder) Encode(e interface{}) error { 33 | checkValue(e) 34 | return enc.gob.Encode(e) 35 | } 36 | 37 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 38 | checkValue(value.Interface()) 39 | return enc.gob.EncodeValue(value) 40 | } 41 | 42 | type LabDecoder struct { 43 | gob *gob.Decoder 44 | } 45 | 46 | func NewDecoder(r io.Reader) *LabDecoder { 47 | dec := &LabDecoder{} 48 | dec.gob = gob.NewDecoder(r) 49 | return dec 50 | } 51 | 52 | func (dec *LabDecoder) Decode(e interface{}) error { 53 | checkValue(e) 54 | checkDefault(e) 55 | return dec.gob.Decode(e) 56 | } 57 | 58 | func Register(value interface{}) { 59 | checkValue(value) 60 | gob.Register(value) 61 | } 62 | 63 | func RegisterName(name string, value interface{}) { 64 | checkValue(value) 65 | gob.RegisterName(name, value) 66 | } 67 | 68 | func checkValue(value interface{}) { 69 | checkType(reflect.TypeOf(value)) 70 | } 71 | 72 | func checkType(t reflect.Type) { 73 | k := t.Kind() 74 | 75 | mu.Lock() 76 | // only complain once, and avoid recursion. 77 | if checked == nil { 78 | checked = map[reflect.Type]bool{} 79 | } 80 | if checked[t] { 81 | mu.Unlock() 82 | return 83 | } 84 | checked[t] = true 85 | mu.Unlock() 86 | 87 | switch k { 88 | case reflect.Struct: 89 | for i := 0; i < t.NumField(); i++ { 90 | f := t.Field(i) 91 | rune, _ := utf8.DecodeRuneInString(f.Name) 92 | if unicode.IsUpper(rune) == false { 93 | // ta da 94 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 95 | f.Name, t.Name()) 96 | mu.Lock() 97 | errorCount += 1 98 | mu.Unlock() 99 | } 100 | checkType(f.Type) 101 | } 102 | return 103 | case reflect.Slice, reflect.Array, reflect.Ptr: 104 | checkType(t.Elem()) 105 | return 106 | case reflect.Map: 107 | checkType(t.Elem()) 108 | checkType(t.Key()) 109 | return 110 | default: 111 | return 112 | } 113 | } 114 | 115 | // 116 | // warn if the value contains non-default values, 117 | // as it would if one sent an RPC but the reply 118 | // struct was already modified. if the RPC reply 119 | // contains default values, GOB won't overwrite 120 | // the non-default value. 
121 | // 122 | func checkDefault(value interface{}) { 123 | if value == nil { 124 | return 125 | } 126 | checkDefault1(reflect.ValueOf(value), 1, "") 127 | } 128 | 129 | func checkDefault1(value reflect.Value, depth int, name string) { 130 | if depth > 3 { 131 | return 132 | } 133 | 134 | t := value.Type() 135 | k := t.Kind() 136 | 137 | switch k { 138 | case reflect.Struct: 139 | for i := 0; i < t.NumField(); i++ { 140 | vv := value.Field(i) 141 | name1 := t.Field(i).Name 142 | if name != "" { 143 | name1 = name + "." + name1 144 | } 145 | checkDefault1(vv, depth+1, name1) 146 | } 147 | return 148 | case reflect.Ptr: 149 | if value.IsNil() { 150 | return 151 | } 152 | checkDefault1(value.Elem(), depth+1, name) 153 | return 154 | case reflect.Bool, 155 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 156 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 157 | reflect.Uintptr, reflect.Float32, reflect.Float64, 158 | reflect.String: 159 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 160 | mu.Lock() 161 | if errorCount < 1 { 162 | what := name 163 | if what == "" { 164 | what = t.Name() 165 | } 166 | // this warning typically arises if code re-uses the same RPC reply 167 | // variable for multiple RPC calls, or if code restores persisted 168 | // state into variable that already have non-default values. 169 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 170 | what) 171 | } 172 | errorCount += 1 173 | mu.Unlock() 174 | } 175 | return 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 
26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 
163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/labrpc/labrpc.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | // 4 | // channel-based RPC, for 824 labs. 5 | // 6 | // simulates a network that can lose requests, lose replies, 7 | // delay messages, and entirely disconnect particular hosts. 8 | // 9 | // we will use the original labrpc.go to test your code for grading. 10 | // so, while you can modify this code to help you debug, please 11 | // test against the original before submitting. 12 | // 13 | // adapted from Go net/rpc/server.go. 14 | // 15 | // sends labgob-encoded values to ensure that RPCs 16 | // don't include references to program objects. 17 | // 18 | // net := MakeNetwork() -- holds network, clients, servers. 19 | // end := net.MakeEnd(endname) -- create a client end-point, to talk to one server. 20 | // net.AddServer(servername, server) -- adds a named server to network. 21 | // net.DeleteServer(servername) -- eliminate the named server. 22 | // net.Connect(endname, servername) -- connect a client to a server. 23 | // net.Enable(endname, enabled) -- enable/disable a client. 24 | // net.Reliable(bool) -- false means drop/delay messages 25 | // 26 | // end.Call("Raft.AppendEntries", &args, &reply) -- send an RPC, wait for reply. 27 | // the "Raft" is the name of the server struct to be called. 28 | // the "AppendEntries" is the name of the method to be called. 29 | // Call() returns true to indicate that the server executed the request 30 | // and the reply is valid. 31 | // Call() returns false if the network lost the request or reply 32 | // or the server is down. 33 | // It is OK to have multiple Call()s in progress at the same time on the 34 | // same ClientEnd. 35 | // Concurrent calls to Call() may be delivered to the server out of order, 36 | // since the network may re-order messages. 37 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 38 | // handler function on the server side does not return. 39 | // the server RPC handler function must declare its args and reply arguments 40 | // as pointers, so that their types exactly match the types of the arguments 41 | // to Call(). 42 | // 43 | // srv := MakeServer() 44 | // srv.AddService(svc) -- a server can have multiple services, e.g. Raft and k/v 45 | // pass srv to net.AddServer() 46 | // 47 | // svc := MakeService(receiverObject) -- obj's methods will handle RPCs 48 | // much like Go's rpcs.Register() 49 | // pass svc to srv.AddService() 50 | // 51 | 52 | import "Mit6824/src/labgob" 53 | import "bytes" 54 | import "reflect" 55 | import "sync" 56 | import "log" 57 | import "strings" 58 | import "math/rand" 59 | import "time" 60 | import "sync/atomic" 61 | 62 | type reqMsg struct { 63 | endname interface{} // name of sending ClientEnd 64 | svcMeth string // e.g. 
"Raft.AppendEntries" 65 | argsType reflect.Type 66 | args []byte 67 | replyCh chan replyMsg 68 | } 69 | 70 | type replyMsg struct { 71 | ok bool 72 | reply []byte 73 | } 74 | 75 | type ClientEnd struct { 76 | endname interface{} // this end-point's name 77 | ch chan reqMsg // copy of Network.endCh 78 | done chan struct{} // closed when Network is cleaned up 79 | } 80 | 81 | // send an RPC, wait for the reply. 82 | // the return value indicates success; false means that 83 | // no reply was received from the server. 84 | func (e *ClientEnd) Call(svcMeth string, args interface{}, reply interface{}) bool { 85 | req := reqMsg{} 86 | req.endname = e.endname 87 | req.svcMeth = svcMeth 88 | req.argsType = reflect.TypeOf(args) 89 | req.replyCh = make(chan replyMsg) 90 | 91 | qb := new(bytes.Buffer) 92 | qe := labgob.NewEncoder(qb) 93 | qe.Encode(args) 94 | req.args = qb.Bytes() 95 | 96 | select { 97 | case e.ch <- req: 98 | // ok 99 | case <-e.done: 100 | return false 101 | } 102 | 103 | rep := <-req.replyCh 104 | if rep.ok { 105 | rb := bytes.NewBuffer(rep.reply) 106 | rd := labgob.NewDecoder(rb) 107 | if err := rd.Decode(reply); err != nil { 108 | log.Fatalf("ClientEnd.Call(): decode reply: %v\n", err) 109 | } 110 | return true 111 | } else { 112 | return false 113 | } 114 | } 115 | 116 | type Network struct { 117 | mu sync.Mutex 118 | reliable bool 119 | longDelays bool // pause a long time on send on disabled connection 120 | longReordering bool // sometimes delay replies a long time 121 | ends map[interface{}]*ClientEnd // ends, by name 122 | enabled map[interface{}]bool // by end name 123 | servers map[interface{}]*Server // servers, by name 124 | connections map[interface{}]interface{} // endname -> servername 125 | endCh chan reqMsg 126 | done chan struct{} // closed when Network is cleaned up 127 | count int32 // total RPC count, for statistics 128 | } 129 | 130 | func MakeNetwork() *Network { 131 | rn := &Network{} 132 | rn.reliable = true 133 | rn.ends = map[interface{}]*ClientEnd{} 134 | rn.enabled = map[interface{}]bool{} 135 | rn.servers = map[interface{}]*Server{} 136 | rn.connections = map[interface{}](interface{}){} 137 | rn.endCh = make(chan reqMsg) 138 | rn.done = make(chan struct{}) 139 | 140 | // single goroutine to handle all ClientEnd.Call()s 141 | go func() { 142 | for { 143 | select { 144 | case xreq := <-rn.endCh: 145 | atomic.AddInt32(&rn.count, 1) 146 | go rn.processReq(xreq) 147 | case <-rn.done: 148 | return 149 | } 150 | } 151 | }() 152 | 153 | return rn 154 | } 155 | 156 | func (rn *Network) Cleanup() { 157 | close(rn.done) 158 | } 159 | 160 | func (rn *Network) Reliable(yes bool) { 161 | rn.mu.Lock() 162 | defer rn.mu.Unlock() 163 | 164 | rn.reliable = yes 165 | } 166 | 167 | func (rn *Network) LongReordering(yes bool) { 168 | rn.mu.Lock() 169 | defer rn.mu.Unlock() 170 | 171 | rn.longReordering = yes 172 | } 173 | 174 | func (rn *Network) LongDelays(yes bool) { 175 | rn.mu.Lock() 176 | defer rn.mu.Unlock() 177 | 178 | rn.longDelays = yes 179 | } 180 | 181 | func (rn *Network) readEndnameInfo(endname interface{}) (enabled bool, 182 | servername interface{}, server *Server, reliable bool, longreordering bool, 183 | ) { 184 | rn.mu.Lock() 185 | defer rn.mu.Unlock() 186 | 187 | enabled = rn.enabled[endname] 188 | servername = rn.connections[endname] 189 | if servername != nil { 190 | server = rn.servers[servername] 191 | } 192 | reliable = rn.reliable 193 | longreordering = rn.longReordering 194 | return 195 | } 196 | 197 | func (rn *Network) isServerDead(endname 
interface{}, servername interface{}, server *Server) bool { 198 | rn.mu.Lock() 199 | defer rn.mu.Unlock() 200 | 201 | if rn.enabled[endname] == false || rn.servers[servername] != server { 202 | return true 203 | } 204 | return false 205 | } 206 | 207 | func (rn *Network) processReq(req reqMsg) { 208 | enabled, servername, server, reliable, longreordering := rn.readEndnameInfo(req.endname) 209 | 210 | if enabled && servername != nil && server != nil { 211 | if reliable == false { 212 | // short delay 213 | ms := (rand.Int() % 27) 214 | time.Sleep(time.Duration(ms) * time.Millisecond) 215 | } 216 | 217 | if reliable == false && (rand.Int()%1000) < 100 { 218 | // drop the request, return as if timeout 219 | req.replyCh <- replyMsg{false, nil} 220 | return 221 | } 222 | 223 | // execute the request (call the RPC handler). 224 | // in a separate thread so that we can periodically check 225 | // if the server has been killed and the RPC should get a 226 | // failure reply. 227 | ech := make(chan replyMsg) 228 | go func() { 229 | r := server.dispatch(req) 230 | ech <- r 231 | }() 232 | 233 | // wait for handler to return, 234 | // but stop waiting if DeleteServer() has been called, 235 | // and return an error. 236 | var reply replyMsg 237 | replyOK := false 238 | serverDead := false 239 | for replyOK == false && serverDead == false { 240 | select { 241 | case reply = <-ech: 242 | replyOK = true 243 | case <-time.After(100 * time.Millisecond): 244 | serverDead = rn.isServerDead(req.endname, servername, server) 245 | if serverDead { 246 | go func() { 247 | <-ech // drain channel to let the goroutine created earlier terminate 248 | }() 249 | } 250 | } 251 | } 252 | 253 | // do not reply if DeleteServer() has been called, i.e. 254 | // the server has been killed. this is needed to avoid 255 | // situation in which a client gets a positive reply 256 | // to an Append, but the server persisted the update 257 | // into the old Persister. config.go is careful to call 258 | // DeleteServer() before superseding the Persister. 259 | serverDead = rn.isServerDead(req.endname, servername, server) 260 | 261 | if replyOK == false || serverDead == true { 262 | // server was killed while we were waiting; return error. 263 | req.replyCh <- replyMsg{false, nil} 264 | } else if reliable == false && (rand.Int()%1000) < 100 { 265 | // drop the reply, return as if timeout 266 | req.replyCh <- replyMsg{false, nil} 267 | } else if longreordering == true && rand.Intn(900) < 600 { 268 | // delay the response for a while 269 | ms := 200 + rand.Intn(1+rand.Intn(2000)) 270 | // Russ points out that this timer arrangement will decrease 271 | // the number of goroutines, so that the race 272 | // detector is less likely to get upset. 273 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 274 | req.replyCh <- reply 275 | }) 276 | } else { 277 | req.replyCh <- reply 278 | } 279 | } else { 280 | // simulate no reply and eventual timeout. 281 | ms := 0 282 | if rn.longDelays { 283 | // let Raft tests check that leader doesn't send 284 | // RPCs synchronously. 285 | ms = (rand.Int() % 7000) 286 | } else { 287 | // many kv tests require the client to try each 288 | // server in fairly rapid succession. 289 | ms = (rand.Int() % 100) 290 | } 291 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 292 | req.replyCh <- replyMsg{false, nil} 293 | }) 294 | } 295 | 296 | } 297 | 298 | // create a client end-point. 299 | // start the thread that listens and delivers. 
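// (note: in this version MakeEnd itself does not start a goroutine; requests
// are delivered by the single goroutine started in MakeNetwork, which reads
// rn.endCh and dispatches each call via processReq.)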
300 | func (rn *Network) MakeEnd(endname interface{}) *ClientEnd { 301 | rn.mu.Lock() 302 | defer rn.mu.Unlock() 303 | 304 | if _, ok := rn.ends[endname]; ok { 305 | log.Fatalf("MakeEnd: %v already exists\n", endname) 306 | } 307 | 308 | e := &ClientEnd{} 309 | e.endname = endname 310 | e.ch = rn.endCh 311 | e.done = rn.done 312 | rn.ends[endname] = e 313 | rn.enabled[endname] = false 314 | rn.connections[endname] = nil 315 | 316 | return e 317 | } 318 | 319 | func (rn *Network) AddServer(servername interface{}, rs *Server) { 320 | rn.mu.Lock() 321 | defer rn.mu.Unlock() 322 | 323 | rn.servers[servername] = rs 324 | } 325 | 326 | func (rn *Network) DeleteServer(servername interface{}) { 327 | rn.mu.Lock() 328 | defer rn.mu.Unlock() 329 | 330 | rn.servers[servername] = nil 331 | } 332 | 333 | // connect a ClientEnd to a server. 334 | // a ClientEnd can only be connected once in its lifetime. 335 | func (rn *Network) Connect(endname interface{}, servername interface{}) { 336 | rn.mu.Lock() 337 | defer rn.mu.Unlock() 338 | 339 | rn.connections[endname] = servername 340 | } 341 | 342 | // enable/disable a ClientEnd. 343 | func (rn *Network) Enable(endname interface{}, enabled bool) { 344 | rn.mu.Lock() 345 | defer rn.mu.Unlock() 346 | 347 | rn.enabled[endname] = enabled 348 | } 349 | 350 | // get a server's count of incoming RPCs. 351 | func (rn *Network) GetCount(servername interface{}) int { 352 | rn.mu.Lock() 353 | defer rn.mu.Unlock() 354 | 355 | svr := rn.servers[servername] 356 | return svr.GetCount() 357 | } 358 | 359 | func (rn *Network) GetTotalCount() int { 360 | x := atomic.LoadInt32(&rn.count) 361 | return int(x) 362 | } 363 | 364 | // 365 | // a server is a collection of services, all sharing 366 | // the same rpc dispatcher. so that e.g. both a Raft 367 | // and a k/v server can listen to the same rpc endpoint. 368 | // 369 | type Server struct { 370 | mu sync.Mutex 371 | services map[string]*Service 372 | count int // incoming RPCs 373 | } 374 | 375 | func MakeServer() *Server { 376 | rs := &Server{} 377 | rs.services = map[string]*Service{} 378 | return rs 379 | } 380 | 381 | func (rs *Server) AddService(svc *Service) { 382 | rs.mu.Lock() 383 | defer rs.mu.Unlock() 384 | rs.services[svc.name] = svc 385 | } 386 | 387 | func (rs *Server) dispatch(req reqMsg) replyMsg { 388 | rs.mu.Lock() 389 | 390 | rs.count += 1 391 | 392 | // split Raft.AppendEntries into service and method 393 | dot := strings.LastIndex(req.svcMeth, ".") 394 | serviceName := req.svcMeth[:dot] 395 | methodName := req.svcMeth[dot+1:] 396 | 397 | service, ok := rs.services[serviceName] 398 | 399 | rs.mu.Unlock() 400 | 401 | if ok { 402 | return service.dispatch(methodName, req) 403 | } else { 404 | choices := []string{} 405 | for k, _ := range rs.services { 406 | choices = append(choices, k) 407 | } 408 | log.Fatalf("labrpc.Server.dispatch(): unknown service %v in %v.%v; expecting one of %v\n", 409 | serviceName, serviceName, methodName, choices) 410 | return replyMsg{false, nil} 411 | } 412 | } 413 | 414 | func (rs *Server) GetCount() int { 415 | rs.mu.Lock() 416 | defer rs.mu.Unlock() 417 | return rs.count 418 | } 419 | 420 | // an object with methods that can be called via RPC. 421 | // a single server may have more than one Service. 
422 | type Service struct { 423 | name string 424 | rcvr reflect.Value 425 | typ reflect.Type 426 | methods map[string]reflect.Method 427 | } 428 | 429 | func MakeService(rcvr interface{}) *Service { 430 | svc := &Service{} 431 | svc.typ = reflect.TypeOf(rcvr) 432 | svc.rcvr = reflect.ValueOf(rcvr) 433 | svc.name = reflect.Indirect(svc.rcvr).Type().Name() 434 | svc.methods = map[string]reflect.Method{} 435 | 436 | for m := 0; m < svc.typ.NumMethod(); m++ { 437 | method := svc.typ.Method(m) 438 | mtype := method.Type 439 | mname := method.Name 440 | 441 | //fmt.Printf("%v pp %v ni %v 1k %v 2k %v no %v\n", 442 | // mname, method.PkgPath, mtype.NumIn(), mtype.In(1).Kind(), mtype.In(2).Kind(), mtype.NumOut()) 443 | 444 | if method.PkgPath != "" || // capitalized? 445 | mtype.NumIn() != 3 || 446 | //mtype.In(1).Kind() != reflect.Ptr || 447 | mtype.In(2).Kind() != reflect.Ptr || 448 | mtype.NumOut() != 0 { 449 | // the method is not suitable for a handler 450 | //fmt.Printf("bad method: %v\n", mname) 451 | } else { 452 | // the method looks like a handler 453 | svc.methods[mname] = method 454 | } 455 | } 456 | 457 | return svc 458 | } 459 | 460 | func (svc *Service) dispatch(methname string, req reqMsg) replyMsg { 461 | if method, ok := svc.methods[methname]; ok { 462 | // prepare space into which to read the argument. 463 | // the Value's type will be a pointer to req.argsType. 464 | args := reflect.New(req.argsType) 465 | 466 | // decode the argument. 467 | ab := bytes.NewBuffer(req.args) 468 | ad := labgob.NewDecoder(ab) 469 | ad.Decode(args.Interface()) 470 | 471 | // allocate space for the reply. 472 | replyType := method.Type.In(2) 473 | replyType = replyType.Elem() 474 | replyv := reflect.New(replyType) 475 | 476 | // call the method. 477 | function := method.Func 478 | function.Call([]reflect.Value{svc.rcvr, args.Elem(), replyv}) 479 | 480 | // encode the reply. 
481 | rb := new(bytes.Buffer) 482 | re := labgob.NewEncoder(rb) 483 | re.EncodeValue(replyv) 484 | 485 | return replyMsg{true, rb.Bytes()} 486 | } else { 487 | choices := []string{} 488 | for k, _ := range svc.methods { 489 | choices = append(choices, k) 490 | } 491 | log.Fatalf("labrpc.Service.dispatch(): unknown method %v in %v; expecting one of %v\n", 492 | methname, req.svcMeth, choices) 493 | return replyMsg{false, nil} 494 | } 495 | } 496 | -------------------------------------------------------------------------------- /src/labrpc/test_test.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | import "testing" 4 | import "strconv" 5 | import "sync" 6 | import "runtime" 7 | import "time" 8 | import "fmt" 9 | 10 | type JunkArgs struct { 11 | X int 12 | } 13 | type JunkReply struct { 14 | X string 15 | } 16 | 17 | type JunkServer struct { 18 | mu sync.Mutex 19 | log1 []string 20 | log2 []int 21 | } 22 | 23 | func (js *JunkServer) Handler1(args string, reply *int) { 24 | js.mu.Lock() 25 | defer js.mu.Unlock() 26 | js.log1 = append(js.log1, args) 27 | *reply, _ = strconv.Atoi(args) 28 | } 29 | 30 | func (js *JunkServer) Handler2(args int, reply *string) { 31 | js.mu.Lock() 32 | defer js.mu.Unlock() 33 | js.log2 = append(js.log2, args) 34 | *reply = "handler2-" + strconv.Itoa(args) 35 | } 36 | 37 | func (js *JunkServer) Handler3(args int, reply *int) { 38 | js.mu.Lock() 39 | defer js.mu.Unlock() 40 | time.Sleep(20 * time.Second) 41 | *reply = -args 42 | } 43 | 44 | // args is a pointer 45 | func (js *JunkServer) Handler4(args *JunkArgs, reply *JunkReply) { 46 | reply.X = "pointer" 47 | } 48 | 49 | // args is a not pointer 50 | func (js *JunkServer) Handler5(args JunkArgs, reply *JunkReply) { 51 | reply.X = "no pointer" 52 | } 53 | 54 | func TestBasic(t *testing.T) { 55 | runtime.GOMAXPROCS(4) 56 | 57 | rn := MakeNetwork() 58 | defer rn.Cleanup() 59 | 60 | e := rn.MakeEnd("end1-99") 61 | 62 | js := &JunkServer{} 63 | svc := MakeService(js) 64 | 65 | rs := MakeServer() 66 | rs.AddService(svc) 67 | rn.AddServer("server99", rs) 68 | 69 | rn.Connect("end1-99", "server99") 70 | rn.Enable("end1-99", true) 71 | 72 | { 73 | reply := "" 74 | e.Call("JunkServer.Handler2", 111, &reply) 75 | if reply != "handler2-111" { 76 | t.Fatalf("wrong reply from Handler2") 77 | } 78 | } 79 | 80 | { 81 | reply := 0 82 | e.Call("JunkServer.Handler1", "9099", &reply) 83 | if reply != 9099 { 84 | t.Fatalf("wrong reply from Handler1") 85 | } 86 | } 87 | } 88 | 89 | func TestTypes(t *testing.T) { 90 | runtime.GOMAXPROCS(4) 91 | 92 | rn := MakeNetwork() 93 | defer rn.Cleanup() 94 | 95 | e := rn.MakeEnd("end1-99") 96 | 97 | js := &JunkServer{} 98 | svc := MakeService(js) 99 | 100 | rs := MakeServer() 101 | rs.AddService(svc) 102 | rn.AddServer("server99", rs) 103 | 104 | rn.Connect("end1-99", "server99") 105 | rn.Enable("end1-99", true) 106 | 107 | { 108 | var args JunkArgs 109 | var reply JunkReply 110 | // args must match type (pointer or not) of handler. 111 | e.Call("JunkServer.Handler4", &args, &reply) 112 | if reply.X != "pointer" { 113 | t.Fatalf("wrong reply from Handler4") 114 | } 115 | } 116 | 117 | { 118 | var args JunkArgs 119 | var reply JunkReply 120 | // args must match type (pointer or not) of handler. 121 | e.Call("JunkServer.Handler5", args, &reply) 122 | if reply.X != "no pointer" { 123 | t.Fatalf("wrong reply from Handler5") 124 | } 125 | } 126 | } 127 | 128 | // 129 | // does net.Enable(endname, false) really disconnect a client? 
130 | // 131 | func TestDisconnect(t *testing.T) { 132 | runtime.GOMAXPROCS(4) 133 | 134 | rn := MakeNetwork() 135 | defer rn.Cleanup() 136 | 137 | e := rn.MakeEnd("end1-99") 138 | 139 | js := &JunkServer{} 140 | svc := MakeService(js) 141 | 142 | rs := MakeServer() 143 | rs.AddService(svc) 144 | rn.AddServer("server99", rs) 145 | 146 | rn.Connect("end1-99", "server99") 147 | 148 | { 149 | reply := "" 150 | e.Call("JunkServer.Handler2", 111, &reply) 151 | if reply != "" { 152 | t.Fatalf("unexpected reply from Handler2") 153 | } 154 | } 155 | 156 | rn.Enable("end1-99", true) 157 | 158 | { 159 | reply := 0 160 | e.Call("JunkServer.Handler1", "9099", &reply) 161 | if reply != 9099 { 162 | t.Fatalf("wrong reply from Handler1") 163 | } 164 | } 165 | } 166 | 167 | // 168 | // test net.GetCount() 169 | // 170 | func TestCounts(t *testing.T) { 171 | runtime.GOMAXPROCS(4) 172 | 173 | rn := MakeNetwork() 174 | defer rn.Cleanup() 175 | 176 | e := rn.MakeEnd("end1-99") 177 | 178 | js := &JunkServer{} 179 | svc := MakeService(js) 180 | 181 | rs := MakeServer() 182 | rs.AddService(svc) 183 | rn.AddServer(99, rs) 184 | 185 | rn.Connect("end1-99", 99) 186 | rn.Enable("end1-99", true) 187 | 188 | for i := 0; i < 17; i++ { 189 | reply := "" 190 | e.Call("JunkServer.Handler2", i, &reply) 191 | wanted := "handler2-" + strconv.Itoa(i) 192 | if reply != wanted { 193 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 194 | } 195 | } 196 | 197 | n := rn.GetCount(99) 198 | if n != 17 { 199 | t.Fatalf("wrong GetCount() %v, expected 17\n", n) 200 | } 201 | } 202 | 203 | // 204 | // test RPCs from concurrent ClientEnds 205 | // 206 | func TestConcurrentMany(t *testing.T) { 207 | runtime.GOMAXPROCS(4) 208 | 209 | rn := MakeNetwork() 210 | defer rn.Cleanup() 211 | 212 | js := &JunkServer{} 213 | svc := MakeService(js) 214 | 215 | rs := MakeServer() 216 | rs.AddService(svc) 217 | rn.AddServer(1000, rs) 218 | 219 | ch := make(chan int) 220 | 221 | nclients := 20 222 | nrpcs := 10 223 | for ii := 0; ii < nclients; ii++ { 224 | go func(i int) { 225 | n := 0 226 | defer func() { ch <- n }() 227 | 228 | e := rn.MakeEnd(i) 229 | rn.Connect(i, 1000) 230 | rn.Enable(i, true) 231 | 232 | for j := 0; j < nrpcs; j++ { 233 | arg := i*100 + j 234 | reply := "" 235 | e.Call("JunkServer.Handler2", arg, &reply) 236 | wanted := "handler2-" + strconv.Itoa(arg) 237 | if reply != wanted { 238 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 239 | } 240 | n += 1 241 | } 242 | }(ii) 243 | } 244 | 245 | total := 0 246 | for ii := 0; ii < nclients; ii++ { 247 | x := <-ch 248 | total += x 249 | } 250 | 251 | if total != nclients*nrpcs { 252 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nclients*nrpcs) 253 | } 254 | 255 | n := rn.GetCount(1000) 256 | if n != total { 257 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 258 | } 259 | } 260 | 261 | // 262 | // test unreliable 263 | // 264 | func TestUnreliable(t *testing.T) { 265 | runtime.GOMAXPROCS(4) 266 | 267 | rn := MakeNetwork() 268 | defer rn.Cleanup() 269 | rn.Reliable(false) 270 | 271 | js := &JunkServer{} 272 | svc := MakeService(js) 273 | 274 | rs := MakeServer() 275 | rs.AddService(svc) 276 | rn.AddServer(1000, rs) 277 | 278 | ch := make(chan int) 279 | 280 | nclients := 300 281 | for ii := 0; ii < nclients; ii++ { 282 | go func(i int) { 283 | n := 0 284 | defer func() { ch <- n }() 285 | 286 | e := rn.MakeEnd(i) 287 | rn.Connect(i, 1000) 288 | rn.Enable(i, true) 289 | 290 | arg := i * 100 291 | reply := "" 
292 | ok := e.Call("JunkServer.Handler2", arg, &reply) 293 | if ok { 294 | wanted := "handler2-" + strconv.Itoa(arg) 295 | if reply != wanted { 296 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 297 | } 298 | n += 1 299 | } 300 | }(ii) 301 | } 302 | 303 | total := 0 304 | for ii := 0; ii < nclients; ii++ { 305 | x := <-ch 306 | total += x 307 | } 308 | 309 | if total == nclients || total == 0 { 310 | t.Fatalf("all RPCs succeeded despite unreliable") 311 | } 312 | } 313 | 314 | // 315 | // test concurrent RPCs from a single ClientEnd 316 | // 317 | func TestConcurrentOne(t *testing.T) { 318 | runtime.GOMAXPROCS(4) 319 | 320 | rn := MakeNetwork() 321 | defer rn.Cleanup() 322 | 323 | js := &JunkServer{} 324 | svc := MakeService(js) 325 | 326 | rs := MakeServer() 327 | rs.AddService(svc) 328 | rn.AddServer(1000, rs) 329 | 330 | e := rn.MakeEnd("c") 331 | rn.Connect("c", 1000) 332 | rn.Enable("c", true) 333 | 334 | ch := make(chan int) 335 | 336 | nrpcs := 20 337 | for ii := 0; ii < nrpcs; ii++ { 338 | go func(i int) { 339 | n := 0 340 | defer func() { ch <- n }() 341 | 342 | arg := 100 + i 343 | reply := "" 344 | e.Call("JunkServer.Handler2", arg, &reply) 345 | wanted := "handler2-" + strconv.Itoa(arg) 346 | if reply != wanted { 347 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 348 | } 349 | n += 1 350 | }(ii) 351 | } 352 | 353 | total := 0 354 | for ii := 0; ii < nrpcs; ii++ { 355 | x := <-ch 356 | total += x 357 | } 358 | 359 | if total != nrpcs { 360 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nrpcs) 361 | } 362 | 363 | js.mu.Lock() 364 | defer js.mu.Unlock() 365 | if len(js.log2) != nrpcs { 366 | t.Fatalf("wrong number of RPCs delivered") 367 | } 368 | 369 | n := rn.GetCount(1000) 370 | if n != total { 371 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 372 | } 373 | } 374 | 375 | // 376 | // regression: an RPC that's delayed during Enabled=false 377 | // should not delay subsequent RPCs (e.g. after Enabled=true). 378 | // 379 | func TestRegression1(t *testing.T) { 380 | runtime.GOMAXPROCS(4) 381 | 382 | rn := MakeNetwork() 383 | defer rn.Cleanup() 384 | 385 | js := &JunkServer{} 386 | svc := MakeService(js) 387 | 388 | rs := MakeServer() 389 | rs.AddService(svc) 390 | rn.AddServer(1000, rs) 391 | 392 | e := rn.MakeEnd("c") 393 | rn.Connect("c", 1000) 394 | 395 | // start some RPCs while the ClientEnd is disabled. 396 | // they'll be delayed. 397 | rn.Enable("c", false) 398 | ch := make(chan bool) 399 | nrpcs := 20 400 | for ii := 0; ii < nrpcs; ii++ { 401 | go func(i int) { 402 | ok := false 403 | defer func() { ch <- ok }() 404 | 405 | arg := 100 + i 406 | reply := "" 407 | // this call ought to return false. 408 | e.Call("JunkServer.Handler2", arg, &reply) 409 | ok = true 410 | }(ii) 411 | } 412 | 413 | time.Sleep(100 * time.Millisecond) 414 | 415 | // now enable the ClientEnd and check that an RPC completes quickly. 
416 | t0 := time.Now() 417 | rn.Enable("c", true) 418 | { 419 | arg := 99 420 | reply := "" 421 | e.Call("JunkServer.Handler2", arg, &reply) 422 | wanted := "handler2-" + strconv.Itoa(arg) 423 | if reply != wanted { 424 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 425 | } 426 | } 427 | dur := time.Since(t0).Seconds() 428 | 429 | if dur > 0.03 { 430 | t.Fatalf("RPC took too long (%v) after Enable", dur) 431 | } 432 | 433 | for ii := 0; ii < nrpcs; ii++ { 434 | <-ch 435 | } 436 | 437 | js.mu.Lock() 438 | defer js.mu.Unlock() 439 | if len(js.log2) != 1 { 440 | t.Fatalf("wrong number (%v) of RPCs delivered, expected 1", len(js.log2)) 441 | } 442 | 443 | n := rn.GetCount(1000) 444 | if n != 1 { 445 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, 1) 446 | } 447 | } 448 | 449 | // 450 | // if an RPC is stuck in a server, and the server 451 | // is killed with DeleteServer(), does the RPC 452 | // get un-stuck? 453 | // 454 | func TestKilled(t *testing.T) { 455 | runtime.GOMAXPROCS(4) 456 | 457 | rn := MakeNetwork() 458 | defer rn.Cleanup() 459 | 460 | e := rn.MakeEnd("end1-99") 461 | 462 | js := &JunkServer{} 463 | svc := MakeService(js) 464 | 465 | rs := MakeServer() 466 | rs.AddService(svc) 467 | rn.AddServer("server99", rs) 468 | 469 | rn.Connect("end1-99", "server99") 470 | rn.Enable("end1-99", true) 471 | 472 | doneCh := make(chan bool) 473 | go func() { 474 | reply := 0 475 | ok := e.Call("JunkServer.Handler3", 99, &reply) 476 | doneCh <- ok 477 | }() 478 | 479 | time.Sleep(1000 * time.Millisecond) 480 | 481 | select { 482 | case <-doneCh: 483 | t.Fatalf("Handler3 should not have returned yet") 484 | case <-time.After(100 * time.Millisecond): 485 | } 486 | 487 | rn.DeleteServer("server99") 488 | 489 | select { 490 | case x := <-doneCh: 491 | if x != false { 492 | t.Fatalf("Handler3 returned successfully despite DeleteServer()") 493 | } 494 | case <-time.After(100 * time.Millisecond): 495 | t.Fatalf("Handler3 should return after DeleteServer()") 496 | } 497 | } 498 | 499 | func TestBenchmark(t *testing.T) { 500 | runtime.GOMAXPROCS(4) 501 | 502 | rn := MakeNetwork() 503 | defer rn.Cleanup() 504 | 505 | e := rn.MakeEnd("end1-99") 506 | 507 | js := &JunkServer{} 508 | svc := MakeService(js) 509 | 510 | rs := MakeServer() 511 | rs.AddService(svc) 512 | rn.AddServer("server99", rs) 513 | 514 | rn.Connect("end1-99", "server99") 515 | rn.Enable("end1-99", true) 516 | 517 | t0 := time.Now() 518 | n := 100000 519 | for iters := 0; iters < n; iters++ { 520 | reply := "" 521 | e.Call("JunkServer.Handler2", 111, &reply) 522 | if reply != "handler2-111" { 523 | t.Fatalf("wrong reply from Handler2") 524 | } 525 | } 526 | fmt.Printf("%v for %v\n", time.Since(t0), n) 527 | // march 2016, rtm laptop, 22 microseconds per RPC 528 | } 529 | -------------------------------------------------------------------------------- /src/linearizability/bitset.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | type bitset []uint64 4 | 5 | // data layout: 6 | // bits 0-63 are in data[0], the next are in data[1], etc. 
7 | 8 | func newBitset(bits uint) bitset { 9 | extra := uint(0) 10 | if bits%64 != 0 { 11 | extra = 1 12 | } 13 | chunks := bits/64 + extra 14 | return bitset(make([]uint64, chunks)) 15 | } 16 | 17 | func (b bitset) clone() bitset { 18 | dataCopy := make([]uint64, len(b)) 19 | copy(dataCopy, b) 20 | return bitset(dataCopy) 21 | } 22 | 23 | func bitsetIndex(pos uint) (uint, uint) { 24 | return pos / 64, pos % 64 25 | } 26 | 27 | func (b bitset) set(pos uint) bitset { 28 | major, minor := bitsetIndex(pos) 29 | b[major] |= (1 << minor) 30 | return b 31 | } 32 | 33 | func (b bitset) clear(pos uint) bitset { 34 | major, minor := bitsetIndex(pos) 35 | b[major] &^= (1 << minor) 36 | return b 37 | } 38 | 39 | func (b bitset) get(pos uint) bool { 40 | major, minor := bitsetIndex(pos) 41 | return b[major]&(1<<minor) != 0 42 | } 43 | 44 | func (b bitset) popcnt() uint { 45 | total := uint(0) 46 | for _, v := range b { 47 | v = (v & 0x5555555555555555) + ((v & 0xAAAAAAAAAAAAAAAA) >> 1) 48 | v = (v & 0x3333333333333333) + ((v & 0xCCCCCCCCCCCCCCCC) >> 2) 49 | v = (v & 0x0F0F0F0F0F0F0F0F) + ((v & 0xF0F0F0F0F0F0F0F0) >> 4) 50 | v *= 0x0101010101010101 51 | total += uint((v >> 56) & 0xFF) 52 | } 53 | return total 54 | } 55 | 56 | func (b bitset) hash() uint64 { 57 | hash := uint64(b.popcnt()) 58 | for _, v := range b { 59 | hash ^= v 60 | } 61 | return hash 62 | } 63 | 64 | func (b bitset) equals(b2 bitset) bool { 65 | if len(b) != len(b2) { 66 | return false 67 | } 68 | for i := range b { 69 | if b[i] != b2[i] { 70 | return false 71 | } 72 | } 73 | return true 74 | } 75 | -------------------------------------------------------------------------------- /src/linearizability/linearizability.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | import ( 4 | "sort" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type entryKind bool 10 | 11 | const ( 12 | callEntry entryKind = false 13 | returnEntry = true 14 | ) 15 | 16 | type entry struct { 17 | kind entryKind 18 | value interface{} 19 | id uint 20 | time int64 21 | } 22 | 23 | type byTime []entry 24 | 25 | func (a byTime) Len() int { 26 | return len(a) 27 | } 28 | 29 | func (a byTime) Swap(i, j int) { 30 | a[i], a[j] = a[j], a[i] 31 | } 32 | 33 | func (a byTime) Less(i, j int) bool { 34 | return a[i].time < a[j].time 35 | } 36 | 37 | func makeEntries(history []Operation) []entry { 38 | var entries []entry = nil 39 | id := uint(0) 40 | for _, elem := range history { 41 | entries = append(entries, entry{ 42 | callEntry, elem.Input, id, elem.Call}) 43 | entries = append(entries, entry{ 44 | returnEntry, elem.Output, id, elem.Return}) 45 | id++ 46 | } 47 | sort.Sort(byTime(entries)) 48 | return entries 49 | } 50 | 51 | type node struct { 52 | value interface{} 53 | match *node // call if match is nil, otherwise return 54 | id uint 55 | next *node 56 | prev *node 57 | } 58 | 59 | func insertBefore(n *node, mark *node) *node { 60 | if mark != nil { 61 | beforeMark := mark.prev 62 | mark.prev = n 63 | n.next = mark 64 | if beforeMark != nil { 65 | n.prev = beforeMark 66 | beforeMark.next = n 67 | } 68 | } 69 | return n 70 | } 71 | 72 | func length(n *node) uint { 73 | l := uint(0) 74 | for n != nil { 75 | n = n.next 76 | l++ 77 | } 78 | return l 79 | } 80 | 81 | func renumber(events []Event) []Event { 82 | var e []Event 83 | m := make(map[uint]uint) // renumbering 84 | id := uint(0) 85 | for _, v := range events { 86 | if r, ok := m[v.Id]; ok { 87 | e = append(e, Event{v.Kind, v.Value, r}) 88 | } else { 89 | e = append(e, Event{v.Kind, v.Value, id}) 90 | m[v.Id] = id 91 | id++ 92 | } 93 | } 94 | return e 95 | } 96 | 97 | func convertEntries(events []Event) []entry { 98 | var
entries []entry 99 | for _, elem := range events { 100 | kind := callEntry 101 | if elem.Kind == ReturnEvent { 102 | kind = returnEntry 103 | } 104 | entries = append(entries, entry{kind, elem.Value, elem.Id, -1}) 105 | } 106 | return entries 107 | } 108 | 109 | func makeLinkedEntries(entries []entry) *node { 110 | var root *node = nil 111 | match := make(map[uint]*node) 112 | for i := len(entries) - 1; i >= 0; i-- { 113 | elem := entries[i] 114 | if elem.kind == returnEntry { 115 | entry := &node{value: elem.value, match: nil, id: elem.id} 116 | match[elem.id] = entry 117 | insertBefore(entry, root) 118 | root = entry 119 | } else { 120 | entry := &node{value: elem.value, match: match[elem.id], id: elem.id} 121 | insertBefore(entry, root) 122 | root = entry 123 | } 124 | } 125 | return root 126 | } 127 | 128 | type cacheEntry struct { 129 | linearized bitset 130 | state interface{} 131 | } 132 | 133 | func cacheContains(model Model, cache map[uint64][]cacheEntry, entry cacheEntry) bool { 134 | for _, elem := range cache[entry.linearized.hash()] { 135 | if entry.linearized.equals(elem.linearized) && model.Equal(entry.state, elem.state) { 136 | return true 137 | } 138 | } 139 | return false 140 | } 141 | 142 | type callsEntry struct { 143 | entry *node 144 | state interface{} 145 | } 146 | 147 | func lift(entry *node) { 148 | entry.prev.next = entry.next 149 | entry.next.prev = entry.prev 150 | match := entry.match 151 | match.prev.next = match.next 152 | if match.next != nil { 153 | match.next.prev = match.prev 154 | } 155 | } 156 | 157 | func unlift(entry *node) { 158 | match := entry.match 159 | match.prev.next = match 160 | if match.next != nil { 161 | match.next.prev = match 162 | } 163 | entry.prev.next = entry 164 | entry.next.prev = entry 165 | } 166 | 167 | func checkSingle(model Model, subhistory *node, kill *int32) bool { 168 | n := length(subhistory) / 2 169 | linearized := newBitset(n) 170 | cache := make(map[uint64][]cacheEntry) // map from hash to cache entry 171 | var calls []callsEntry 172 | 173 | state := model.Init() 174 | headEntry := insertBefore(&node{value: nil, match: nil, id: ^uint(0)}, subhistory) 175 | entry := subhistory 176 | for headEntry.next != nil { 177 | if atomic.LoadInt32(kill) != 0 { 178 | return false 179 | } 180 | if entry.match != nil { 181 | matching := entry.match // the return entry 182 | ok, newState := model.Step(state, entry.value, matching.value) 183 | if ok { 184 | newLinearized := linearized.clone().set(entry.id) 185 | newCacheEntry := cacheEntry{newLinearized, newState} 186 | if !cacheContains(model, cache, newCacheEntry) { 187 | hash := newLinearized.hash() 188 | cache[hash] = append(cache[hash], newCacheEntry) 189 | calls = append(calls, callsEntry{entry, state}) 190 | state = newState 191 | linearized.set(entry.id) 192 | lift(entry) 193 | entry = headEntry.next 194 | } else { 195 | entry = entry.next 196 | } 197 | } else { 198 | entry = entry.next 199 | } 200 | } else { 201 | if len(calls) == 0 { 202 | return false 203 | } 204 | callsTop := calls[len(calls)-1] 205 | entry = callsTop.entry 206 | state = callsTop.state 207 | linearized.clear(entry.id) 208 | calls = calls[:len(calls)-1] 209 | unlift(entry) 210 | entry = entry.next 211 | } 212 | } 213 | return true 214 | } 215 | 216 | func fillDefault(model Model) Model { 217 | if model.Partition == nil { 218 | model.Partition = NoPartition 219 | } 220 | if model.PartitionEvent == nil { 221 | model.PartitionEvent = NoPartitionEvent 222 | } 223 | if model.Equal == nil { 224 | model.Equal = 
ShallowEqual 225 | } 226 | return model 227 | } 228 | 229 | func CheckOperations(model Model, history []Operation) bool { 230 | return CheckOperationsTimeout(model, history, 0) 231 | } 232 | 233 | // timeout = 0 means no timeout 234 | // if this operation times out, then a false positive is possible 235 | func CheckOperationsTimeout(model Model, history []Operation, timeout time.Duration) bool { 236 | model = fillDefault(model) 237 | partitions := model.Partition(history) 238 | ok := true 239 | results := make(chan bool) 240 | kill := int32(0) 241 | for _, subhistory := range partitions { 242 | l := makeLinkedEntries(makeEntries(subhistory)) 243 | go func() { 244 | results <- checkSingle(model, l, &kill) 245 | }() 246 | } 247 | var timeoutChan <-chan time.Time 248 | if timeout > 0 { 249 | timeoutChan = time.After(timeout) 250 | } 251 | count := 0 252 | loop: 253 | for { 254 | select { 255 | case result := <-results: 256 | ok = ok && result 257 | if !ok { 258 | atomic.StoreInt32(&kill, 1) 259 | break loop 260 | } 261 | count++ 262 | if count >= len(partitions) { 263 | break loop 264 | } 265 | case <-timeoutChan: 266 | break loop // if we time out, we might get a false positive 267 | } 268 | } 269 | return ok 270 | } 271 | 272 | func CheckEvents(model Model, history []Event) bool { 273 | return CheckEventsTimeout(model, history, 0) 274 | } 275 | 276 | // timeout = 0 means no timeout 277 | // if this operation times out, then a false positive is possible 278 | func CheckEventsTimeout(model Model, history []Event, timeout time.Duration) bool { 279 | model = fillDefault(model) 280 | partitions := model.PartitionEvent(history) 281 | ok := true 282 | results := make(chan bool) 283 | kill := int32(0) 284 | for _, subhistory := range partitions { 285 | l := makeLinkedEntries(convertEntries(renumber(subhistory))) 286 | go func() { 287 | results <- checkSingle(model, l, &kill) 288 | }() 289 | } 290 | var timeoutChan <-chan time.Time 291 | if timeout > 0 { 292 | timeoutChan = time.After(timeout) 293 | } 294 | count := 0 295 | loop: 296 | for { 297 | select { 298 | case result := <-results: 299 | ok = ok && result 300 | if !ok { 301 | atomic.StoreInt32(&kill, 1) 302 | break loop 303 | } 304 | count++ 305 | if count >= len(partitions) { 306 | break loop 307 | } 308 | case <-timeoutChan: 309 | break loop // if we time out, we might get a false positive 310 | } 311 | } 312 | return ok 313 | } 314 | -------------------------------------------------------------------------------- /src/linearizability/model.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | type Operation struct { 4 | Input interface{} 5 | Call int64 // invocation time 6 | Output interface{} 7 | Return int64 // response time 8 | } 9 | 10 | type EventKind bool 11 | 12 | const ( 13 | CallEvent EventKind = false 14 | ReturnEvent EventKind = true 15 | ) 16 | 17 | type Event struct { 18 | Kind EventKind 19 | Value interface{} 20 | Id uint 21 | } 22 | 23 | type Model struct { 24 | // Partition functions, such that a history is linearizable if an only 25 | // if each partition is linearizable. If you don't want to implement 26 | // this, you can always use the `NoPartition` functions implemented 27 | // below. 28 | Partition func(history []Operation) [][]Operation 29 | PartitionEvent func(history []Event) [][]Event 30 | // Initial state of the system. 31 | Init func() interface{} 32 | // Step function for the system. 
Returns whether or not the system 33 | // could take this step with the given inputs and outputs and also 34 | // returns the new state. This should not mutate the existing state. 35 | Step func(state interface{}, input interface{}, output interface{}) (bool, interface{}) 36 | // Equality on states. If you are using a simple data type for states, 37 | // you can use the `ShallowEqual` function implemented below. 38 | Equal func(state1, state2 interface{}) bool 39 | } 40 | 41 | func NoPartition(history []Operation) [][]Operation { 42 | return [][]Operation{history} 43 | } 44 | 45 | func NoPartitionEvent(history []Event) [][]Event { 46 | return [][]Event{history} 47 | } 48 | 49 | func ShallowEqual(state1, state2 interface{}) bool { 50 | return state1 == state2 51 | } 52 | -------------------------------------------------------------------------------- /src/linearizability/models.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | // kv model 4 | 5 | type KvInput struct { 6 | Op uint8 // 0 => get, 1 => put, 2 => append 7 | Key string 8 | Value string 9 | } 10 | 11 | type KvOutput struct { 12 | Value string 13 | } 14 | 15 | func KvModel() Model { 16 | return Model{ 17 | Partition: func(history []Operation) [][]Operation { 18 | m := make(map[string][]Operation) 19 | for _, v := range history { 20 | key := v.Input.(KvInput).Key 21 | m[key] = append(m[key], v) 22 | } 23 | var ret [][]Operation 24 | for _, v := range m { 25 | ret = append(ret, v) 26 | } 27 | return ret 28 | }, 29 | Init: func() interface{} { 30 | // note: we are modeling a single key's value here; 31 | // we're partitioning by key, so this is okay 32 | return "" 33 | }, 34 | Step: func(state, input, output interface{}) (bool, interface{}) { 35 | inp := input.(KvInput) 36 | out := output.(KvOutput) 37 | st := state.(string) 38 | if inp.Op == 0 { 39 | // get 40 | return out.Value == st, state 41 | } else if inp.Op == 1 { 42 | // put 43 | return true, inp.Value 44 | } else { 45 | // append 46 | return true, (st + inp.Value) 47 | } 48 | }, 49 | Equal: ShallowEqual, 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/diskvd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a diskvd server. it's a member of some replica 5 | // group, which has other members, and it needs to know 6 | // how to talk to the members of the shardmaster service. 7 | // used by ../diskv/test_test.go 8 | // 9 | // arguments: 10 | // -g groupid 11 | // -m masterport1 -m masterport2 ... 12 | // -s replicaport1 -s replicaport2 ... 13 | // -i my-index-in-server-port-list 14 | // -u unreliable 15 | // -d directory 16 | // -r restart 17 | 18 | import "time" 19 | import "diskv" 20 | import "os" 21 | import "fmt" 22 | import "strconv" 23 | import "runtime" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: diskvd -g gid -m master... -s server... 
-i my-index -d dir\n") 27 | os.Exit(1) 28 | } 29 | 30 | func main() { 31 | var gid int64 = -1 // my replica group ID 32 | masters := []string{} // ports of shardmasters 33 | replicas := []string{} // ports of servers in my replica group 34 | me := -1 // my index in replicas[] 35 | unreliable := false 36 | dir := "" // store persistent data here 37 | restart := false 38 | 39 | for i := 1; i+1 < len(os.Args); i += 2 { 40 | a0 := os.Args[i] 41 | a1 := os.Args[i+1] 42 | if a0 == "-g" { 43 | gid, _ = strconv.ParseInt(a1, 10, 64) 44 | } else if a0 == "-m" { 45 | masters = append(masters, a1) 46 | } else if a0 == "-s" { 47 | replicas = append(replicas, a1) 48 | } else if a0 == "-i" { 49 | me, _ = strconv.Atoi(a1) 50 | } else if a0 == "-u" { 51 | unreliable, _ = strconv.ParseBool(a1) 52 | } else if a0 == "-d" { 53 | dir = a1 54 | } else if a0 == "-r" { 55 | restart, _ = strconv.ParseBool(a1) 56 | } else { 57 | usage() 58 | } 59 | } 60 | 61 | if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" { 62 | usage() 63 | } 64 | 65 | runtime.GOMAXPROCS(4) 66 | 67 | srv := diskv.StartServer(gid, masters, replicas, me, dir, restart) 68 | srv.Setunreliable(unreliable) 69 | 70 | // for safety, force quit after 10 minutes. 71 | time.Sleep(10 * 60 * time.Second) 72 | mep, _ := os.FindProcess(os.Getpid()) 73 | mep.Kill() 74 | } 75 | -------------------------------------------------------------------------------- /src/main/lockc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see comments in lockd.go 5 | // 6 | 7 | import "lockservice" 8 | import "os" 9 | import "fmt" 10 | 11 | func usage() { 12 | fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n") 13 | os.Exit(1) 14 | } 15 | 16 | func main() { 17 | if len(os.Args) == 5 { 18 | ck := lockservice.MakeClerk(os.Args[2], os.Args[3]) 19 | var ok bool 20 | if os.Args[1] == "-l" { 21 | ok = ck.Lock(os.Args[4]) 22 | } else if os.Args[1] == "-u" { 23 | ok = ck.Unlock(os.Args[4]) 24 | } else { 25 | usage() 26 | } 27 | fmt.Printf("reply: %v\n", ok) 28 | } else { 29 | usage() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/lockd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // export GOPATH=~/6.824 4 | // go build lockd.go 5 | // go build lockc.go 6 | // ./lockd -p a b & 7 | // ./lockd -b a b & 8 | // ./lockc -l a b lx 9 | // ./lockc -u a b lx 10 | // 11 | // on Athena, use /tmp/myname-a and /tmp/myname-b 12 | // instead of a and b. 
13 | 14 | import "time" 15 | import "lockservice" 16 | import "os" 17 | import "fmt" 18 | 19 | func main() { 20 | if len(os.Args) == 4 && os.Args[1] == "-p" { 21 | lockservice.StartServer(os.Args[2], os.Args[3], true) 22 | } else if len(os.Args) == 4 && os.Args[1] == "-b" { 23 | lockservice.StartServer(os.Args[2], os.Args[3], false) 24 | } else { 25 | fmt.Printf("Usage: lockd -p|-b primaryport backupport\n") 26 | os.Exit(1) 27 | } 28 | for { 29 | time.Sleep(100 * time.Second) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/mrmaster.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start the master process, which is implemented 5 | // in ../mr/master.go 6 | // 7 | // go run mrmaster.go pg*.txt 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "time" 12 | import "os" 13 | import "fmt" 14 | 15 | func main() { 16 | if len(os.Args) < 2 { 17 | fmt.Fprintf(os.Stderr, "Usage: mrmaster inputfiles...\n") 18 | os.Exit(1) 19 | } 20 | 21 | m := mr.MakeMaster(os.Args[1:], 10) 22 | for m.Done() == false { 23 | time.Sleep(time.Second) 24 | } 25 | 26 | time.Sleep(time.Second) 27 | } 28 | -------------------------------------------------------------------------------- /src/main/mrsequential.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // simple sequential MapReduce. 5 | // 6 | // go run mrsequential.go ../mrapps/wc.so pg*.txt 7 | // 8 | 9 | import "fmt" 10 | import "Mit6824/src/mr" 11 | import "plugin" 12 | import "os" 13 | import "log" 14 | import "io/ioutil" 15 | import "sort" 16 | 17 | // for sorting by key. 18 | type ByKey []mr.KeyValue 19 | 20 | // for sorting by key. 21 | func (a ByKey) Len() int { return len(a) } 22 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 23 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 24 | 25 | func main() { 26 | if len(os.Args) < 3 { 27 | fmt.Fprintf(os.Stderr, "Usage: mrsequential ../mrapps/xxx.so inputfiles...\n") 28 | os.Exit(1) 29 | } 30 | 31 | mapf, reducef := loadPlugin(os.Args[1]) 32 | 33 | // 34 | // read each input file, 35 | // pass it to Map, 36 | // accumulate the intermediate Map output. 37 | // 38 | intermediate := []mr.KeyValue{} 39 | for _, filename := range os.Args[2:] { 40 | file, err := os.Open(filename) 41 | if err != nil { 42 | log.Fatalf("cannot open %v", filename) 43 | } 44 | content, err := ioutil.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("cannot read %v", filename) 47 | } 48 | file.Close() 49 | kva := mapf(filename, string(content)) 50 | intermediate = append(intermediate, kva...) 51 | } 52 | 53 | // 54 | // a big difference from real MapReduce is that all the 55 | // intermediate data is in one place, intermediate[], 56 | // rather than being partitioned into NxM buckets. 57 | // 58 | 59 | sort.Sort(ByKey(intermediate)) 60 | 61 | oname := "mr-out-0" 62 | ofile, _ := os.Create(oname) 63 | 64 | // 65 | // call Reduce on each distinct key in intermediate[], 66 | // and print the result to mr-out-0. 67 | // 68 | i := 0 69 | for i < len(intermediate) { 70 | j := i + 1 71 | for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key { 72 | j++ 73 | } 74 | values := []string{} 75 | for k := i; k < j; k++ { 76 | values = append(values, intermediate[k].Value) 77 | } 78 | output := reducef(intermediate[i].Key, values) 79 | 80 | // this is the correct format for each line of Reduce output. 
81 | fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output) 82 | 83 | i = j 84 | } 85 | 86 | ofile.Close() 87 | } 88 | 89 | // 90 | // load the application Map and Reduce functions 91 | // from a plugin file, e.g. ../mrapps/wc.so 92 | // 93 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 94 | p, err := plugin.Open(filename) 95 | if err != nil { 96 | log.Fatalf("cannot load plugin %v", filename) 97 | } 98 | xmapf, err := p.Lookup("Map") 99 | if err != nil { 100 | log.Fatalf("cannot find Map in %v", filename) 101 | } 102 | mapf := xmapf.(func(string, string) []mr.KeyValue) 103 | xreducef, err := p.Lookup("Reduce") 104 | if err != nil { 105 | log.Fatalf("cannot find Reduce in %v", filename) 106 | } 107 | reducef := xreducef.(func(string, []string) string) 108 | 109 | return mapf, reducef 110 | } 111 | -------------------------------------------------------------------------------- /src/main/mrworker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a worker process, which is implemented 5 | // in ../mr/worker.go. typically there will be 6 | // multiple worker processes, talking to one master. 7 | // 8 | // go run mrworker.go ../mrapps/wc.so 9 | // 10 | 11 | import "Mit6824/src/mr" 12 | import "plugin" 13 | import "os" 14 | import "fmt" 15 | import "log" 16 | 17 | func main() { 18 | if len(os.Args) != 2 { 19 | fmt.Fprintf(os.Stderr, "Usage: mrworker ../mrapps/xxx.so\n") 20 | os.Exit(1) 21 | } 22 | 23 | mapf, reducef := loadPlugin(os.Args[1]) 24 | 25 | mr.Worker(mapf, reducef) 26 | } 27 | 28 | // 29 | // load the application Map and Reduce functions 30 | // from a plugin file, e.g. ../mrapps/wc.so 31 | // 32 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 33 | p, err := plugin.Open(filename) 34 | if err != nil { 35 | log.Fatalf("cannot load plugin %v", filename) 36 | } 37 | xmapf, err := p.Lookup("Map") 38 | if err != nil { 39 | log.Fatalf("cannot find Map in %v", filename) 40 | } 41 | mapf := xmapf.(func(string, string) []mr.KeyValue) 42 | xreducef, err := p.Lookup("Reduce") 43 | if err != nil { 44 | log.Fatalf("cannot find Reduce in %v", filename) 45 | } 46 | reducef := xreducef.(func(string, []string) string) 47 | 48 | return mapf, reducef 49 | } 50 | -------------------------------------------------------------------------------- /src/main/pbc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // pbservice client application 5 | // 6 | // export GOPATH=~/6.824 7 | // go build viewd.go 8 | // go build pbd.go 9 | // go build pbc.go 10 | // ./viewd /tmp/rtm-v & 11 | // ./pbd /tmp/rtm-v /tmp/rtm-1 & 12 | // ./pbd /tmp/rtm-v /tmp/rtm-2 & 13 | // ./pbc /tmp/rtm-v key1 value1 14 | // ./pbc /tmp/rtm-v key1 15 | // 16 | // change "rtm" to your user name. 17 | // start the pbd programs in separate windows and kill 18 | // and restart them to exercise fault tolerance. 
19 | // 20 | 21 | import "pbservice" 22 | import "os" 23 | import "fmt" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: pbc viewport key\n") 27 | fmt.Printf(" pbc viewport key value\n") 28 | os.Exit(1) 29 | } 30 | 31 | func main() { 32 | if len(os.Args) == 3 { 33 | // get 34 | ck := pbservice.MakeClerk(os.Args[1], "") 35 | v := ck.Get(os.Args[2]) 36 | fmt.Printf("%v\n", v) 37 | } else if len(os.Args) == 4 { 38 | // put 39 | ck := pbservice.MakeClerk(os.Args[1], "") 40 | ck.Put(os.Args[2], os.Args[3]) 41 | } else { 42 | usage() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/pbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "pbservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 3 { 14 | fmt.Printf("Usage: pbd viewport myport\n") 15 | os.Exit(1) 16 | } 17 | 18 | pbservice.StartServer(os.Args[1], os.Args[2]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/test-mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # basic map-reduce test 5 | # 6 | 7 | RACE= 8 | 9 | # uncomment this to run the tests with the Go race detector. 10 | #RACE=-race 11 | 12 | # run the test in a fresh sub-directory. 13 | rm -rf mr-tmp 14 | mkdir mr-tmp || exit 1 15 | cd mr-tmp || exit 1 16 | rm -f mr-* 17 | 18 | # make sure software is freshly built. 19 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 20 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 21 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 22 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 23 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 24 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 25 | (cd .. && go build $RACE mrmaster.go) || exit 1 26 | (cd .. && go build $RACE mrworker.go) || exit 1 27 | (cd .. && go build $RACE mrsequential.go) || exit 1 28 | 29 | # first word-count 30 | 31 | # generate the correct output 32 | ../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1 33 | sort mr-out-0 > mr-correct-wc.txt 34 | rm -f mr-out* 35 | 36 | echo '***' Starting wc test. 37 | 38 | ../mrmaster ../pg*txt & 39 | sleep 1 40 | 41 | # start multiple workers 42 | ../mrworker ../../mrapps/wc.so & 43 | ../mrworker ../../mrapps/wc.so & 44 | ../mrworker ../../mrapps/wc.so 45 | 46 | sort mr-out* > mr-wc-all 47 | if cmp mr-wc-all mr-correct-wc.txt 48 | then 49 | echo '---' wc test: PASS 50 | else 51 | echo '---' wc output is not the same as mr-correct-wc.txt 52 | echo '---' wc test: FAIL 53 | exit 1 54 | fi 55 | 56 | # now indexer 57 | rm -f mr-* 58 | 59 | # generate the correct output 60 | ../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1 61 | sort mr-out-0 > mr-correct-indexer.txt 62 | rm -f mr-out* 63 | 64 | echo '***' Starting indexer test. 
65 | 66 | ../mrmaster ../pg*txt & 67 | sleep 1 68 | 69 | # start multiple workers 70 | ../mrworker ../../mrapps/indexer.so & 71 | ../mrworker ../../mrapps/indexer.so 72 | 73 | sort mr-out* > mr-indexer-all 74 | if cmp mr-indexer-all mr-correct-indexer.txt 75 | then 76 | echo '---' indexer test: PASS 77 | else 78 | echo '---' indexer output is not the same as mr-correct-indexer.txt 79 | echo '---' indexer test: FAIL 80 | exit 1 81 | fi 82 | 83 | 84 | 85 | 86 | echo '***' Starting map parallelism test. 87 | 88 | rm -f mr-out* mr-worker* 89 | 90 | ../mrmaster ../pg*txt & 91 | sleep 1 92 | 93 | ../mrworker ../../mrapps/mtiming.so & 94 | ../mrworker ../../mrapps/mtiming.so 95 | 96 | NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'` 97 | if [ "$NT" != "2" ] 98 | then 99 | echo '---' saw "$NT" workers rather than 2 100 | echo '---' map parallelism test: FAIL 101 | exit 1 102 | fi 103 | 104 | if cat mr-out* | grep '^parallel.* 2' > /dev/null 105 | then 106 | echo '---' map parallelism test: PASS 107 | else 108 | echo '---' map workers did not run in parallel 109 | echo '---' map parallelism test: FAIL 110 | exit 1 111 | fi 112 | 113 | 114 | echo '***' Starting reduce parallelism test. 115 | 116 | rm -f mr-out* mr-worker* 117 | 118 | ../mrmaster ../pg*txt & 119 | sleep 1 120 | 121 | ../mrworker ../../mrapps/rtiming.so & 122 | ../mrworker ../../mrapps/rtiming.so 123 | 124 | NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'` 125 | if [ "$NT" -lt "2" ] 126 | then 127 | echo '---' too few parallel reduces. 128 | echo '---' reduce parallelism test: FAIL 129 | exit 1 130 | else 131 | echo '---' reduce parallelism test: PASS 132 | fi 133 | 134 | 135 | 136 | # generate the correct output 137 | ../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1 138 | sort mr-out-0 > mr-correct-crash.txt 139 | rm -f mr-out* 140 | 141 | echo '***' Starting crash test. 142 | 143 | rm -f mr-done 144 | (../mrmaster ../pg*txt ; touch mr-done ) & 145 | sleep 1 146 | 147 | # start multiple workers 148 | ../mrworker ../../mrapps/crash.so & 149 | 150 | ( while [ -e mr-socket -a ! -f mr-done ] 151 | do 152 | ../mrworker ../../mrapps/crash.so 153 | sleep 1 154 | done ) & 155 | 156 | ( while [ -e mr-socket -a ! -f mr-done ] 157 | do 158 | ../mrworker ../../mrapps/crash.so 159 | sleep 1 160 | done ) & 161 | 162 | while [ -e mr-socket -a ! 
-f mr-done ] 163 | do 164 | ../mrworker ../../mrapps/crash.so 165 | sleep 1 166 | done 167 | 168 | wait 169 | wait 170 | wait 171 | 172 | rm mr-socket 173 | sort mr-out* > mr-crash-all 174 | if cmp mr-crash-all mr-correct-crash.txt 175 | then 176 | echo '---' crash test: PASS 177 | else 178 | echo '---' crash output is not the same as mr-correct-crash.txt 179 | echo '---' crash test: FAIL 180 | exit 1 181 | fi 182 | 183 | echo '***' PASSED ALL TESTS 184 | -------------------------------------------------------------------------------- /src/main/viewd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "viewservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 2 { 14 | fmt.Printf("Usage: viewd port\n") 15 | os.Exit(1) 16 | } 17 | 18 | viewservice.StartServer(os.Args[1]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/mr/master.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | ) 11 | import "net" 12 | import "net/rpc" 13 | import "net/http" 14 | 15 | var dispatcher *Dispatcher 16 | 17 | // 主节点 18 | type Master struct { 19 | // Your definitions here. 20 | S *JobState 21 | TP *TaskPool 22 | W *sync.Map 23 | } 24 | 25 | // Job 状态 26 | type JobState struct { 27 | MatrixSource [][]string // MC * RC 28 | MC int 29 | RC int 30 | MCDone int32 31 | nextWorkerID uint64 32 | allDone int // 用于标示全部完成的状态 0代表没有全部完成 1 代表已经全部完成 进入优雅关闭状态 33 | } 34 | 35 | // 任务池 36 | type TaskPool struct { 37 | Pool chan *Task 38 | } 39 | 40 | // 任务 41 | type Task struct { 42 | Status int // 0 未完成 1工作中 2已完成 43 | Type int // 0 map 任务 1 reduce 任务 2 shut down 3 retry 44 | Conf *TaskConf 45 | } 46 | 47 | // 任务配置 48 | type TaskConf struct { 49 | Source []string // 兼容两种任务 50 | RNum int // 当前 map 任务的 任务编号 如果是reduce任务 则为-1 51 | MNum int // 当前 reduce 任务的 任务编号 如果是map任务 则为-1 52 | RC int // reduce 的任务数 53 | } 54 | 55 | // 定时清理器 56 | type Dispatcher struct { 57 | TimeOut time.Duration //默认10秒 58 | M *Master //主节点全局结构 59 | ReduceSourceChan chan *ReduceSource // 发送 reduce 的任务 执行内容 60 | CleanWorkerChan chan uint64 // 清理失效的worker 61 | } 62 | 63 | // 工作者会话管理器 64 | type WorkerSession struct { 65 | WorkerID uint64 66 | Status int // 0 空闲状态 1 工作状态 2 无法正常工作 67 | T *Task 68 | Mux *sync.RWMutex 69 | LastPingTs int64 70 | PingPongChan chan struct{} 71 | } 72 | 73 | type ReduceSource struct { 74 | MIdx int 75 | MapSource []string // map 任务返回的 source 列表 76 | } 77 | 78 | // Your code here -- RPC handlers for the worker to call. 79 | 80 | // 81 | // an example RPC handler. 
82 | // 83 | func (m *Master) Example(args *ExampleArgs, reply *ExampleReply) error { 84 | reply.Y = args.X + 1 85 | return nil 86 | } 87 | 88 | func (m *Master) RegisterWorker(args *RegisterReq, reply *RegisterRes) error { 89 | _ = args 90 | for { 91 | assignID := atomic.LoadUint64(&m.S.nextWorkerID) 92 | if atomic.CompareAndSwapUint64(&m.S.nextWorkerID, assignID, assignID+1) { 93 | reply.WorkerID = assignID 94 | ws := &WorkerSession{ 95 | WorkerID: assignID, 96 | Status: 0, // 0 代表 健康良好 1 代表失联 97 | T: nil, // 正在执行的任务 98 | LastPingTs: time.Now().UnixNano() / 1e6, 99 | Mux: &sync.RWMutex{}, 100 | PingPongChan: make(chan struct{}), 101 | } 102 | m.W.Store(assignID, ws) 103 | go ws.PingPong(dispatcher.TimeOut) 104 | return nil 105 | } 106 | // TODO:不应该无限重试 应该设置一个限制 107 | time.Sleep(10 * time.Millisecond) 108 | } 109 | } 110 | 111 | func (m *Master) GetTaskWorker(args *GetTaskReq, reply *GetTaskRes) error { 112 | // 延迟 5秒后 若五任务就返回 113 | c := time.After(5 * time.Second) 114 | if worker, ok := m.W.Load(args.WorkerID); ok { 115 | w := worker.(*WorkerSession) 116 | select { 117 | case task, ok := <-m.TP.Pool: 118 | if !ok { 119 | shutdown(reply) 120 | m.W.Delete(w.WorkerID) 121 | return nil 122 | } 123 | task.Status = 1 124 | reply.T = task 125 | w.Mux.Lock() 126 | defer w.Mux.Unlock() 127 | w.Status = 1 // 任务负载 128 | w.T = task 129 | case <-c: 130 | // 返回nil 131 | } 132 | } else { 133 | shutdown(reply) 134 | m.W.Delete(args.WorkerID) 135 | } 136 | return nil 137 | } 138 | 139 | func (m *Master) ReportResult(args *ResultReq, reply *ResultRes) error { 140 | //fmt.Println("ReportResult", args.WorkerID, args.M, args.Code) 141 | if len(args.M) == 0 { 142 | reply.Code = 1 143 | reply.Msg = "The report cannot be empty" 144 | return nil 145 | } 146 | if ws, ok := m.W.Load(args.WorkerID); ok { 147 | w := ws.(*WorkerSession) 148 | switch args.Code { 149 | case 0: // map 150 | if w.T == nil { 151 | reply.Msg = "shut down!!!" 152 | reply.Code = 1 153 | return nil 154 | } 155 | dispatcher.ReduceSourceChan <- &ReduceSource{ 156 | MIdx: w.T.Conf.MNum, 157 | MapSource: args.M, 158 | } 159 | case 1: // reduce 160 | if w.T == nil { 161 | reply.Msg = "shut down!!!" 162 | reply.Code = 1 163 | return nil 164 | } 165 | m.S.MatrixSource[m.S.MC][w.T.Conf.RNum] = "done" 166 | case 2: // failed 167 | task := w.T 168 | m.W.Delete(args.WorkerID) 169 | task.Status = 0 // 重新置为 未分配状态 170 | m.TP.Pool <- task // 将任务重新加入队列 171 | reply.Code = 0 172 | return nil 173 | default: 174 | reply.Code = 1 175 | reply.Msg = fmt.Sprintf("Code %d do not recognize", args.Code) 176 | return nil 177 | } 178 | w.Mux.Lock() 179 | defer w.Mux.Unlock() 180 | w.Status = 0 // 节点空闲 181 | w.T = nil // 任务完成 182 | w.LastPingTs = time.Now().UnixNano() / 1e6 // 更新会话时间戳 183 | reply.Code = 0 184 | return nil 185 | } 186 | reply.Code = 1 187 | reply.Msg = "unregistered" 188 | return nil 189 | } 190 | 191 | func (m *Master) PingPong(args *Ping, reply *Pong) error { 192 | if ws, ok := m.W.Load(args.WorkerID); ok { 193 | w := ws.(*WorkerSession) 194 | w.Mux.Lock() 195 | defer w.Mux.Unlock() 196 | w.LastPingTs = time.Now().UnixNano() / 1e6 // 更新会话时间戳 197 | w.PingPongChan <- struct{}{} 198 | } 199 | reply.Code = 0 200 | return nil 201 | } 202 | 203 | func shutdown(reply *GetTaskRes) { 204 | reply.Msg = "shut down!!!" 
205 | reply.T = &Task{ 206 | Status: 0, 207 | Type: 2, 208 | Conf: &TaskConf{Source: []string{}}, 209 | } 210 | } 211 | 212 | func (d *Dispatcher) cleanSession() { 213 | for workerID := range d.CleanWorkerChan { 214 | if w, ok := d.M.W.Load(workerID); ok { 215 | worker := w.(*WorkerSession) 216 | worker.Mux.Lock() 217 | task := worker.T 218 | worker.T = nil 219 | worker.Mux.Unlock() 220 | if task != nil { 221 | task.Status = 0 222 | //fmt.Println("cleanSession.task",workerID,task.Status,task.Conf.Source) 223 | d.M.TP.Pool <- task 224 | } 225 | d.M.W.Delete(worker) 226 | //fmt.Println("cleanSession.worker",workerID) 227 | } 228 | } 229 | } 230 | 231 | func (d *Dispatcher) updateJobState() { 232 | for rs := range d.ReduceSourceChan { 233 | d.M.S.MatrixSource[rs.MIdx] = rs.MapSource 234 | atomic.AddInt32(&d.M.S.MCDone, 1) 235 | if atomic.LoadInt32(&d.M.S.MCDone) == int32(d.M.S.MC) { 236 | //fmt.Println(d.M.S.MCDone) 237 | for j := 0; j < d.M.S.RC; j++ { 238 | sources := make([]string, 0) 239 | for i := 0; i < d.M.S.MC; i++ { 240 | sources = append(sources, d.M.S.MatrixSource[i][j]) 241 | } 242 | d.M.TP.Pool <- &Task{ 243 | Status: 0, 244 | Type: 1, // Reduce 任务 245 | Conf: &TaskConf{ 246 | Source: sources, 247 | RNum: j, 248 | MNum: -1, 249 | RC: d.M.S.RC, 250 | }, 251 | } 252 | d.M.S.MatrixSource[d.M.S.MC][j] = "created" 253 | //fmt.Println(sources, d.M.S.MatrixSource[d.M.S.MC][j]) 254 | } 255 | } 256 | } 257 | } 258 | 259 | func (d *Dispatcher) run() { 260 | go d.cleanSession() 261 | go d.updateJobState() 262 | } 263 | 264 | func (w *WorkerSession) PingPong(ts time.Duration) { 265 | for { 266 | tc := time.NewTicker(ts) 267 | select { 268 | case _ = <-tc.C: 269 | dispatcher.CleanWorkerChan <- w.WorkerID 270 | case _ = <-w.PingPongChan: 271 | tc.Stop() 272 | // TODO: 这里应该 有一个 close 信号将协程退出 否则程序中会存在大量无用的协程 存在泄露的风险 273 | } 274 | } 275 | } 276 | 277 | // 278 | // start a thread that listens for RPCs from worker.go 279 | // 280 | func (m *Master) server() { 281 | if err := rpc.Register(m); err != nil { 282 | panic(err) 283 | } 284 | rpc.HandleHTTP() 285 | //l, e := net.Listen("tcp", ":1234") 286 | _ = os.Remove("mr-socket") 287 | l, e := net.Listen("unix", "mr-socket") 288 | if e != nil { 289 | log.Fatal("listen error:", e) 290 | } 291 | go func() { 292 | if err := http.Serve(l, nil); err != nil { 293 | panic(err) 294 | } 295 | }() 296 | } 297 | 298 | // 299 | // main/mrmaster.go calls Done() periodically to find out 300 | // if the entire job has finished. 301 | // 302 | func (m *Master) Done() bool { 303 | ret := false 304 | // Your code here. 305 | count := 0 306 | for _, v := range m.S.MatrixSource[m.S.MC] { 307 | if v == "done" { 308 | count++ 309 | } 310 | } 311 | if count == m.S.RC { 312 | //fmt.Println(m.S.allDone) 313 | //close(dispatcher.CleanWorkerChan) 314 | //close(dispatcher.ReduceSourceChan) 315 | if len(m.TP.Pool) != 0 { 316 | return false 317 | } 318 | if m.S.allDone == 0 { 319 | close(m.TP.Pool) // 将会通知所有 worker 进行下线 320 | m.S.allDone = 1 321 | } 322 | c := 0 323 | m.W.Range(func(key, value interface{}) bool { 324 | w := value.(*WorkerSession) 325 | if w.T != nil { 326 | c++ 327 | } 328 | return true 329 | }) 330 | if c == 0 { 331 | ret = true 332 | // TODO: 一个完美主义者 不想让命令行打印出来一些无关紧要的东西(又不想改框架本身) 333 | _ = os.Remove("mr-socket") 334 | _, _ = os.Create("mr-socket") 335 | } 336 | } 337 | return ret 338 | } 339 | 340 | // 341 | // create a Master. 342 | // 343 | func MakeMaster(files []string, nReduce int) *Master { 344 | m := Master{} 345 | // Your code here. 
346 | sources := make([][]string, len(files)+1) // 多出一行保存完成状态 347 | for i := 0; i < len(sources); i++ { 348 | sources[i] = make([]string, nReduce) 349 | } 350 | m.S = &JobState{ 351 | MatrixSource: sources, 352 | MC: len(files), 353 | RC: nReduce, 354 | nextWorkerID: uint64(0), 355 | } 356 | m.TP = &TaskPool{Pool: make(chan *Task, len(files))} 357 | m.W = &sync.Map{} 358 | 359 | dispatcher = &Dispatcher{ 360 | TimeOut: 10 * time.Second, 361 | M: &m, 362 | ReduceSourceChan: make(chan *ReduceSource, nReduce), 363 | CleanWorkerChan: make(chan uint64, len(files)), 364 | } 365 | dispatcher.run() 366 | // 初始化map任务 367 | for num, file := range files { 368 | m.TP.Pool <- &Task{ 369 | Status: 0, // 0 未完成 1工作中 2已完成 370 | Type: 0, // 0 map 任务 1 reduce 任务 2 shut down 3 retry 371 | Conf: &TaskConf{Source: []string{file}, MNum: num, RNum: -1, RC: nReduce}, 372 | } 373 | } 374 | m.server() 375 | return &m 376 | } 377 | -------------------------------------------------------------------------------- /src/mr/rpc.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | 7 | // 8 | // example to show how to declare the arguments 9 | // and reply for an RPC. 10 | // 11 | 12 | type ExampleArgs struct { 13 | X int 14 | } 15 | 16 | type ExampleReply struct { 17 | Y int 18 | } 19 | 20 | // Add your RPC definitions here. 21 | // 注册 22 | type RegisterReq struct { 23 | } 24 | 25 | type RegisterRes struct { 26 | WorkerID uint64 27 | } 28 | 29 | // 获取任务 30 | type GetTaskReq struct { 31 | WorkerID uint64 32 | } 33 | type GetTaskRes struct { 34 | Code int 35 | Msg string 36 | T *Task 37 | } 38 | 39 | // 返回结果 40 | type ResultReq struct { 41 | WorkerID uint64 42 | Code int // 0 代表 map 1 代表 reduce 2代表 失败 43 | Msg string 44 | M []string 45 | } 46 | 47 | type ResultRes struct { 48 | Code int 49 | Msg string 50 | } 51 | 52 | // 健康检查 53 | type Ping struct { 54 | WorkerID uint64 55 | } 56 | 57 | type Pong struct { 58 | Code int 59 | } 60 | -------------------------------------------------------------------------------- /src/mr/worker.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "os" 9 | "sort" 10 | "strings" 11 | "time" 12 | ) 13 | import "log" 14 | import "net/rpc" 15 | import "hash/fnv" 16 | 17 | // 18 | // Map functions return a slice of KeyValue. 19 | // 20 | type KeyValue struct { 21 | Key string 22 | Value string 23 | } 24 | 25 | // for sorting by key. 26 | type ByKey []KeyValue 27 | 28 | // for sorting by key. 29 | func (a ByKey) Len() int { return len(a) } 30 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 31 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 32 | 33 | // 34 | // use ihash(key) % NReduce to choose the reduce 35 | // task number for each KeyValue emitted by Map. 36 | // 37 | func ihash(key string) int { 38 | h := fnv.New32a() 39 | h.Write([]byte(key)) 40 | return int(h.Sum32() & 0x7fffffff) 41 | } 42 | 43 | var workerID uint64 44 | 45 | func Worker(mapf func(string, string) []KeyValue, 46 | reducef func(string, []string) string) { 47 | // Your worker implementation here. 
48 | workerID = Register() 49 | go func() { 50 | tc := time.NewTicker(10 * time.Second) 51 | defer tc.Stop() 52 | for { 53 | <-tc.C 54 | PingPong() 55 | } 56 | }() 57 | var task *Task 58 | for { 59 | // 1.获取任务 60 | task = GetTask() 61 | // 2.根据任务类型执行任务 62 | res, err := ExecTask(mapf, reducef, task) 63 | if err != nil { 64 | continue 65 | } 66 | // 3.报告结果 67 | Report(res) 68 | } 69 | } 70 | 71 | // 72 | // example function to show how to make an RPC call to the master. 73 | // 74 | 75 | const ( 76 | CallRegister = "Master.RegisterWorker" 77 | CallPingPong = "Master.PingPong" 78 | CallGetTask = "Master.GetTaskWorker" 79 | CallReport = "Master.ReportResult" 80 | ) 81 | 82 | func CallExample() { 83 | 84 | // declare an argument structure. 85 | args := ExampleArgs{} 86 | 87 | // fill in the argument(s). 88 | args.X = 99 89 | 90 | // declare a reply structure. 91 | reply := ExampleReply{} 92 | 93 | // send the RPC request, wait for the reply. 94 | call("Master.Example", &args, &reply) 95 | 96 | // reply.Y should be 100. 97 | fmt.Printf("reply.Y %v\n", reply.Y) 98 | } 99 | 100 | // TODO: 是否可以包装成对象 101 | func Register() uint64 { 102 | args, reply := RegisterReq{}, RegisterRes{} 103 | // TODO: 处理请求异常? 104 | call(CallRegister, &args, &reply) 105 | return reply.WorkerID 106 | } 107 | func PingPong() { 108 | args, reply := Ping{WorkerID: workerID}, Pong{} 109 | call(CallPingPong, &args, &reply) 110 | //TODO:解析PingPong异常? 111 | } 112 | func GetTask() *Task { 113 | args, reply := GetTaskReq{WorkerID: workerID}, GetTaskRes{} 114 | call(CallGetTask, &args, &reply) 115 | //TODO:解析异常? 116 | return reply.T 117 | } 118 | func ExecTask(mapf func(string, string) []KeyValue, 119 | reducef func(string, []string) string, task *Task) (*ResultReq, error) { 120 | var res []string 121 | req := &ResultReq{WorkerID: workerID} 122 | if task == nil { 123 | return nil, fmt.Errorf("retry") // TODO:应该定义一种错误类型 124 | } 125 | if task.Type == 0 { 126 | // TODO: 执行 map 任务 127 | res, _ = doMap(mapf, task) 128 | req.Code = 0 129 | } else if task.Type == 1 { 130 | // TODO: 执行 reduce 任务 131 | res, _ = doReduce(reducef, task) 132 | req.Code = 1 133 | } else if task.Type == 2 { 134 | os.Exit(0) 135 | } 136 | if len(res) == 0 { 137 | return nil, fmt.Errorf("retry") // TODO:应该定义一种错误类型 138 | } 139 | req.M = res 140 | return req, nil 141 | } 142 | 143 | func Report(res *ResultReq) { 144 | reply := ResultRes{} 145 | call(CallReport, res, &reply) 146 | //TODO:处理返回异常? 
147 | } 148 | 149 | func doMap(mapf func(string, string) []KeyValue, task *Task) ([]string, error) { 150 | // TODO:对task进行检查 151 | res := make([]string, 0) 152 | fileName := task.Conf.Source[0] 153 | file, err := os.Open(fileName) 154 | defer func() { 155 | _ = file.Close() 156 | }() 157 | if err != nil { 158 | return nil, fmt.Errorf("doMap.Open.err:%s", err.Error()) 159 | } 160 | content, err := ioutil.ReadAll(file) 161 | if err != nil { 162 | return nil, fmt.Errorf(fmt.Sprintf("doMap.ReadAll.err:%s", err.Error())) 163 | } 164 | cacheMap := make(map[string][]KeyValue, 0) 165 | for i := 0; i < task.Conf.RC; i++ { 166 | key := fmt.Sprintf("mr-worker-%d-%d.out", task.Conf.MNum, i) 167 | cacheMap[key] = []KeyValue{} 168 | res = append(res, key) 169 | } 170 | kva := mapf(fileName, string(content)) 171 | for i := 0; i < len(kva); i++ { 172 | idx := ihash(kva[i].Key) % task.Conf.RC // TODO: ihash(kva[i].Key) & (task.Conf.RC - 1) 173 | key := fmt.Sprintf("mr-worker-%d-%d.out", task.Conf.MNum, idx) 174 | cacheMap[key] = append(cacheMap[key], kva[i]) 175 | } 176 | 177 | for key, value := range cacheMap { 178 | sort.Sort(ByKey(value)) 179 | // TODO: 在这里可以调用一次 reduce函数进行合并 以减少网络调用 180 | combine(value) 181 | // TODO: 这里是否也存在的 map函数生成文件的幂等问题 182 | outFile, _ := os.Create(key) 183 | for i := 0; i < len(value); i++ { 184 | _, _ = fmt.Fprintf(outFile, "%v %v\n", value[i].Key, value[i].Value) 185 | } 186 | _ = outFile.Close() 187 | } 188 | return res, nil 189 | } 190 | 191 | func doReduce(reducef func(string, []string) string, task *Task) ([]string, error) { 192 | // TODO: 检查task 193 | // TODO: 外部排序? -> 先实现一个内存排序吧 194 | kvas := readFiles(task) 195 | tmpFileName := fmt.Sprintf("mr-out-%d.%d.swap", time.Now().Unix(), task.Conf.RNum) 196 | outFile, _ := os.OpenFile(tmpFileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0664) 197 | defer outFile.Close() 198 | // TODO:先在内存排个顺序 199 | sort.Sort(ByKey(kvas)) 200 | // TODO: 先这样处理 201 | if len(kvas) == 0 { 202 | key := fmt.Sprintf("mr-out-%d", task.Conf.RNum) 203 | _ = os.Rename(tmpFileName, key) 204 | return []string{key}, nil 205 | } 206 | buf := []KeyValue{kvas[0]} 207 | // TODO:处理边界 208 | // [a,a,a,a,a,b,b,b,b,c,c,c] 209 | for i := 1; i < len(kvas); i++ { 210 | if buf[len(buf)-1].Key == kvas[i].Key { // buf 中最后的一个key 与当前key 相同 211 | buf = append(buf, kvas[i]) 212 | } else { 213 | out := reducef(buf[len(buf)-1].Key, toValues(buf)) 214 | _, _ = fmt.Fprintf(outFile, "%v %v\n", buf[len(buf)-1].Key, out) 215 | buf = []KeyValue{kvas[i]} 216 | } 217 | } 218 | // 写入最后的buf进去 219 | out := reducef(buf[len(buf)-1].Key, toValues(buf)) 220 | _, _ = fmt.Fprintf(outFile, "%v %v\n", buf[len(buf)-1].Key, out) 221 | //TODO:处理路径问题? 
222 | key := fmt.Sprintf("mr-out-%d", task.Conf.RNum) 223 | _ = os.Rename(tmpFileName, key) 224 | return []string{key}, nil 225 | } 226 | 227 | func toValues(kvas []KeyValue) []string { 228 | res := make([]string, 0) 229 | for _, kv := range kvas { 230 | res = append(res, kv.Value) 231 | } 232 | return res 233 | } 234 | func readFiles(task *Task) []KeyValue { 235 | // TODO: 生产级别应该实现 外部排序并返回一个迭代器 236 | // TODO: 进行错误处理 237 | res := make([]KeyValue, 0) 238 | for _, v := range task.Conf.Source { 239 | file, _ := os.Open(v) 240 | br := bufio.NewReader(file) 241 | for { 242 | line, _, c := br.ReadLine() 243 | if c == io.EOF { 244 | break 245 | } 246 | data := strings.Split(string(line), " ") 247 | res = append(res, KeyValue{ 248 | Key: data[0], 249 | Value: data[1], 250 | }) 251 | } 252 | _ = file.Close() 253 | } 254 | return res 255 | } 256 | func combine(intermediate []KeyValue) { 257 | 258 | } 259 | func sendTaskFail(task *Task) { 260 | } 261 | 262 | // 263 | // send an RPC request to the master, wait for the response. 264 | // usually returns true. 265 | // returns false if something goes wrong. 266 | // 267 | func call(rpcname string, args interface{}, reply interface{}) bool { 268 | //c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 269 | c, err := rpc.DialHTTP("unix", "mr-socket") 270 | if err != nil { 271 | log.Fatal("dialing:", err) 272 | } 273 | defer c.Close() 274 | 275 | err = c.Call(rpcname, args, reply) 276 | if err == nil { 277 | return true 278 | } 279 | 280 | fmt.Println(err) 281 | return false 282 | } 283 | -------------------------------------------------------------------------------- /src/mrapps/crash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that sometimes crashes, 5 | // and sometimes takes a long time, 6 | // to test MapReduce's ability to recover. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "Mit6824/src/mr" 12 | import crand "crypto/rand" 13 | import "math/big" 14 | import "strings" 15 | import "os" 16 | import "sort" 17 | import "strconv" 18 | import "time" 19 | 20 | func maybeCrash() { 21 | max := big.NewInt(1000) 22 | rr, _ := crand.Int(crand.Reader, max) 23 | if rr.Int64() < 330 { 24 | // crash! 25 | os.Exit(1) 26 | } else if rr.Int64() < 660 { 27 | // delay for a while. 28 | maxms := big.NewInt(10 * 1000) 29 | ms, _ := crand.Int(crand.Reader, maxms) 30 | time.Sleep(time.Duration(ms.Int64()) * time.Millisecond) 31 | } 32 | } 33 | 34 | func Map(filename string, contents string) []mr.KeyValue { 35 | maybeCrash() 36 | 37 | kva := []mr.KeyValue{} 38 | kva = append(kva, mr.KeyValue{"a", filename}) 39 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 40 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 41 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 42 | return kva 43 | } 44 | 45 | func Reduce(key string, values []string) string { 46 | maybeCrash() 47 | 48 | // sort values to ensure deterministic output. 49 | vv := make([]string, len(values)) 50 | copy(vv, values) 51 | sort.Strings(vv) 52 | 53 | val := strings.Join(vv, " ") 54 | return val 55 | } 56 | -------------------------------------------------------------------------------- /src/mrapps/indexer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // an indexing application "plugin" for MapReduce. 
5 | // 6 | // go build -buildmode=plugin indexer.go 7 | // 8 | 9 | import "fmt" 10 | import "Mit6824/src/mr" 11 | 12 | import "strings" 13 | import "unicode" 14 | import "sort" 15 | 16 | // The mapping function is called once for each piece of the input. 17 | // In this framework, the key is the name of the file that is being processed, 18 | // and the value is the file's contents. The return value should be a slice of 19 | // key/value pairs, each represented by a mr.KeyValue. 20 | func Map(document string, value string) (res []mr.KeyValue) { 21 | m := make(map[string]bool) 22 | words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) }) 23 | for _, w := range words { 24 | m[w] = true 25 | } 26 | for w := range m { 27 | kv := mr.KeyValue{w, document} 28 | res = append(res, kv) 29 | } 30 | return 31 | } 32 | 33 | // The reduce function is called once for each key generated by Map, with a 34 | // list of that key's string value (merged across all inputs). The return value 35 | // should be a single output value for that key. 36 | func Reduce(key string, values []string) string { 37 | sort.Strings(values) 38 | return fmt.Sprintf("%d %s", len(values), strings.Join(values, ",")) 39 | } 40 | -------------------------------------------------------------------------------- /src/mrapps/mtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute map tasks in parallel. 6 | // 7 | // go build -buildmode=plugin mtiming.go 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "strings" 12 | import "fmt" 13 | import "os" 14 | import "syscall" 15 | import "time" 16 | import "sort" 17 | import "io/ioutil" 18 | 19 | func nparallel(phase string) int { 20 | // create a file so that other workers will see that 21 | // we're running at the same time as them. 22 | pid := os.Getpid() 23 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 24 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | // are any other workers running? 30 | // find their PIDs by scanning directory for mr-worker-XXX files. 31 | dd, err := os.Open(".") 32 | if err != nil { 33 | panic(err) 34 | } 35 | names, err := dd.Readdirnames(1000000) 36 | if err != nil { 37 | panic(err) 38 | } 39 | ret := 0 40 | for _, name := range names { 41 | var xpid int 42 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 43 | n, err := fmt.Sscanf(name, pat, &xpid) 44 | if n == 1 && err == nil { 45 | err := syscall.Kill(xpid, 0) 46 | if err == nil { 47 | // if err == nil, xpid is alive. 48 | ret += 1 49 | } 50 | } 51 | } 52 | dd.Close() 53 | 54 | time.Sleep(1 * time.Second) 55 | 56 | err = os.Remove(myfilename) 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | return ret 62 | } 63 | 64 | func Map(filename string, contents string) []mr.KeyValue { 65 | t0 := time.Now() 66 | ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0) 67 | pid := os.Getpid() 68 | 69 | n := nparallel("map") 70 | 71 | kva := []mr.KeyValue{} 72 | kva = append(kva, mr.KeyValue{ 73 | fmt.Sprintf("times-%v", pid), 74 | fmt.Sprintf("%.1f", ts)}) 75 | kva = append(kva, mr.KeyValue{ 76 | fmt.Sprintf("parallel-%v", pid), 77 | fmt.Sprintf("%d", n)}) 78 | return kva 79 | } 80 | 81 | func Reduce(key string, values []string) string { 82 | //n := nparallel("reduce") 83 | 84 | // sort values to ensure deterministic output. 
85 | vv := make([]string, len(values)) 86 | copy(vv, values) 87 | sort.Strings(vv) 88 | 89 | val := strings.Join(vv, " ") 90 | return val 91 | } 92 | -------------------------------------------------------------------------------- /src/mrapps/nocrash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // same as crash.go but doesn't actually crash. 5 | // 6 | // go build -buildmode=plugin nocrash.go 7 | // 8 | 9 | import "Mit6824/src/mr" 10 | import crand "crypto/rand" 11 | import "math/big" 12 | import "strings" 13 | import "os" 14 | import "sort" 15 | import "strconv" 16 | 17 | func maybeCrash() { 18 | max := big.NewInt(1000) 19 | rr, _ := crand.Int(crand.Reader, max) 20 | if false && rr.Int64() < 500 { 21 | // crash! 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Map(filename string, contents string) []mr.KeyValue { 27 | maybeCrash() 28 | 29 | kva := []mr.KeyValue{} 30 | kva = append(kva, mr.KeyValue{"a", filename}) 31 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 32 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 33 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 34 | return kva 35 | } 36 | 37 | func Reduce(key string, values []string) string { 38 | maybeCrash() 39 | 40 | // sort values to ensure deterministic output. 41 | vv := make([]string, len(values)) 42 | copy(vv, values) 43 | sort.Strings(vv) 44 | 45 | val := strings.Join(vv, " ") 46 | return val 47 | } 48 | -------------------------------------------------------------------------------- /src/mrapps/rtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute reduce tasks in parallel. 6 | // 7 | // go build -buildmode=plugin rtiming.go 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "fmt" 12 | import "os" 13 | import "syscall" 14 | import "time" 15 | import "io/ioutil" 16 | 17 | func nparallel(phase string) int { 18 | // create a file so that other workers will see that 19 | // we're running at the same time as them. 20 | pid := os.Getpid() 21 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 22 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | // are any other workers running? 28 | // find their PIDs by scanning directory for mr-worker-XXX files. 29 | dd, err := os.Open(".") 30 | if err != nil { 31 | panic(err) 32 | } 33 | names, err := dd.Readdirnames(1000000) 34 | if err != nil { 35 | panic(err) 36 | } 37 | ret := 0 38 | for _, name := range names { 39 | var xpid int 40 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 41 | n, err := fmt.Sscanf(name, pat, &xpid) 42 | if n == 1 && err == nil { 43 | err := syscall.Kill(xpid, 0) 44 | if err == nil { 45 | // if err == nil, xpid is alive. 
46 | ret += 1 47 | } 48 | } 49 | } 50 | dd.Close() 51 | 52 | time.Sleep(1 * time.Second) 53 | 54 | err = os.Remove(myfilename) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | return ret 60 | } 61 | 62 | func Map(filename string, contents string) []mr.KeyValue { 63 | 64 | kva := []mr.KeyValue{} 65 | kva = append(kva, mr.KeyValue{"a", "1"}) 66 | kva = append(kva, mr.KeyValue{"b", "1"}) 67 | kva = append(kva, mr.KeyValue{"c", "1"}) 68 | kva = append(kva, mr.KeyValue{"d", "1"}) 69 | kva = append(kva, mr.KeyValue{"e", "1"}) 70 | kva = append(kva, mr.KeyValue{"f", "1"}) 71 | kva = append(kva, mr.KeyValue{"g", "1"}) 72 | kva = append(kva, mr.KeyValue{"h", "1"}) 73 | kva = append(kva, mr.KeyValue{"i", "1"}) 74 | kva = append(kva, mr.KeyValue{"j", "1"}) 75 | return kva 76 | } 77 | 78 | func Reduce(key string, values []string) string { 79 | n := nparallel("reduce") 80 | 81 | val := fmt.Sprintf("%d", n) 82 | 83 | return val 84 | } 85 | -------------------------------------------------------------------------------- /src/mrapps/wc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc.go 7 | // 8 | 9 | import "Mit6824/src/mr" 10 | import "unicode" 11 | import "strings" 12 | import "strconv" 13 | 14 | // 15 | // The map function is called once for each file of input. The first 16 | // argument is the name of the input file, and the second is the 17 | // file's complete contents. You should ignore the input file name, 18 | // and look only at the contents argument. The return value is a slice 19 | // of key/value pairs. 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | // function to detect word separators. 23 | ff := func(r rune) bool { return !unicode.IsLetter(r) } 24 | 25 | // split contents into an array of words. 26 | words := strings.FieldsFunc(contents, ff) 27 | 28 | kva := []mr.KeyValue{} 29 | for _, w := range words { 30 | kv := mr.KeyValue{w, "1"} 31 | kva = append(kva, kv) 32 | } 33 | return kva 34 | } 35 | 36 | // 37 | // The reduce function is called once for each key generated by the 38 | // map tasks, with a list of all the values created for that key by 39 | // any map task. 40 | // 41 | func Reduce(key string, values []string) string { 42 | // return the number of occurrences of this word. 43 | return strconv.Itoa(len(values)) 44 | } 45 | -------------------------------------------------------------------------------- /src/mroriginal/master.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | import ( 4 | "Mit6824/src/mr" 5 | "log" 6 | ) 7 | import "net" 8 | import "os" 9 | import "net/rpc" 10 | import "net/http" 11 | 12 | type Master struct { 13 | // Your definitions here. 14 | 15 | } 16 | 17 | // Your code here -- RPC handlers for the worker to call. 18 | 19 | // 20 | // an example RPC handler. 
21 | // 22 | func (m *Master) Example(args *mr.ExampleArgs, reply *mr.ExampleReply) error { 23 | reply.Y = args.X + 1 24 | return nil 25 | } 26 | 27 | // 28 | // start a thread that listens for RPCs from worker.go 29 | // 30 | func (m *Master) server() { 31 | rpc.Register(m) 32 | rpc.HandleHTTP() 33 | //l, e := net.Listen("tcp", ":1234") 34 | os.Remove("mr-socket") 35 | l, e := net.Listen("unix", "mr-socket") 36 | if e != nil { 37 | log.Fatal("listen error:", e) 38 | } 39 | go http.Serve(l, nil) 40 | } 41 | 42 | // 43 | // main/mrmaster.go calls Done() periodically to find out 44 | // if the entire job has finished. 45 | // 46 | func (m *Master) Done() bool { 47 | ret := false 48 | 49 | // Your code here. 50 | 51 | return ret 52 | } 53 | 54 | // 55 | // create a Master. 56 | // 57 | func MakeMaster(files []string, nReduce int) *Master { 58 | m := Master{} 59 | 60 | // Your code here. 61 | 62 | return &m 63 | } 64 | -------------------------------------------------------------------------------- /src/mroriginal/rpc.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | 7 | // 8 | // example to show how to declare the arguments 9 | // and reply for an RPC. 10 | // 11 | 12 | type ExampleArgs struct { 13 | X int 14 | } 15 | 16 | type ExampleReply struct { 17 | Y int 18 | } 19 | 20 | // Add your RPC definitions here. 21 | -------------------------------------------------------------------------------- /src/mroriginal/worker.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | import ( 4 | "Mit6824/src/mr" 5 | "fmt" 6 | ) 7 | import "log" 8 | import "net/rpc" 9 | import "hash/fnv" 10 | 11 | // 12 | // Map functions return a slice of KeyValue. 13 | // 14 | type KeyValue struct { 15 | Key string 16 | Value string 17 | } 18 | 19 | // 20 | // use ihash(key) % NReduce to choose the reduce 21 | // task number for each KeyValue emitted by Map. 22 | // 23 | func ihash(key string) int { 24 | h := fnv.New32a() 25 | h.Write([]byte(key)) 26 | return int(h.Sum32() & 0x7fffffff) 27 | } 28 | 29 | func Worker(mapf func(string, string) []KeyValue, 30 | reducef func(string, []string) string) { 31 | 32 | // Your worker implementation here. 33 | 34 | // uncomment to send the Example RPC to the master. 35 | // CallExample() 36 | } 37 | 38 | // 39 | // example function to show how to make an RPC call to the master. 40 | // 41 | func CallExample() { 42 | 43 | // declare an argument structure. 44 | args := mr.ExampleArgs{} 45 | 46 | // fill in the argument(s). 47 | args.X = 99 48 | 49 | // declare a reply structure. 50 | reply := mr.ExampleReply{} 51 | 52 | // send the RPC request, wait for the reply. 53 | call("Master.Example", &args, &reply) 54 | 55 | // reply.Y should be 100. 56 | fmt.Printf("reply.Y %v\n", reply.Y) 57 | } 58 | 59 | // 60 | // send an RPC request to the master, wait for the response. 61 | // usually returns true. 62 | // returns false if something goes wrong. 
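// a typical pattern in Worker() is to wrap each RPC in call(); for example
// (a sketch only -- "Master.GetTask", TaskArgs and TaskReply are
// illustrative names, not part of this skeleton):
//
// args := TaskArgs{}
// reply := TaskReply{}
// if !call("Master.GetTask", &args, &reply) {
// 	// master unreachable: treat the job as finished and return.
// 	return
// }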
63 | // 64 | func call(rpcname string, args interface{}, reply interface{}) bool { 65 | // c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 66 | c, err := rpc.DialHTTP("unix", "mr-socket") 67 | if err != nil { 68 | log.Fatal("dialing:", err) 69 | } 70 | defer c.Close() 71 | 72 | err = c.Call(rpcname, args, reply) 73 | if err == nil { 74 | return true 75 | } 76 | 77 | fmt.Println(err) 78 | return false 79 | } 80 | -------------------------------------------------------------------------------- /src/raft/config.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft tester. 5 | // 6 | // we will use the original config.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 9 | // 10 | 11 | import "Mit6824/src/labrpc" 12 | import "log" 13 | import "sync" 14 | import "testing" 15 | import "runtime" 16 | import "math/rand" 17 | import crand "crypto/rand" 18 | import "math/big" 19 | import "encoding/base64" 20 | import "time" 21 | import "fmt" 22 | 23 | func randstring(n int) string { 24 | b := make([]byte, 2*n) 25 | crand.Read(b) 26 | s := base64.URLEncoding.EncodeToString(b) 27 | return s[0:n] 28 | } 29 | 30 | func makeSeed() int64 { 31 | max := big.NewInt(int64(1) << 62) 32 | bigx, _ := crand.Int(crand.Reader, max) 33 | x := bigx.Int64() 34 | return x 35 | } 36 | 37 | type config struct { 38 | mu sync.Mutex 39 | t *testing.T 40 | net *labrpc.Network 41 | n int 42 | rafts []*Raft 43 | applyErr []string // from apply channel readers 44 | connected []bool // whether each server is on the net 45 | saved []*Persister 46 | endnames [][]string // the port file names each sends to 47 | logs []map[int]int // copy of each server's committed entries 48 | start time.Time // time at which make_config() was called 49 | // begin()/end() statistics 50 | t0 time.Time // time at which test_test.go called cfg.begin() 51 | rpcs0 int // rpcTotal() at start of test 52 | cmds0 int // number of agreements 53 | maxIndex int 54 | maxIndex0 int 55 | } 56 | 57 | var ncpu_once sync.Once 58 | 59 | func make_config(t *testing.T, n int, unreliable bool) *config { 60 | ncpu_once.Do(func() { 61 | if runtime.NumCPU() < 2 { 62 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 63 | } 64 | rand.Seed(makeSeed()) 65 | }) 66 | runtime.GOMAXPROCS(4) 67 | cfg := &config{} 68 | cfg.t = t 69 | cfg.net = labrpc.MakeNetwork() 70 | cfg.n = n 71 | cfg.applyErr = make([]string, cfg.n) 72 | cfg.rafts = make([]*Raft, cfg.n) 73 | cfg.connected = make([]bool, cfg.n) 74 | cfg.saved = make([]*Persister, cfg.n) 75 | cfg.endnames = make([][]string, cfg.n) 76 | cfg.logs = make([]map[int]int, cfg.n) 77 | cfg.start = time.Now() 78 | 79 | cfg.setunreliable(unreliable) 80 | 81 | cfg.net.LongDelays(true) 82 | 83 | // create a full set of Rafts. 84 | for i := 0; i < cfg.n; i++ { 85 | cfg.logs[i] = map[int]int{} 86 | cfg.start1(i) 87 | } 88 | 89 | // connect everyone 90 | for i := 0; i < cfg.n; i++ { 91 | cfg.connect(i) 92 | } 93 | 94 | return cfg 95 | } 96 | 97 | // shut down a Raft server but save its persistent state. 98 | func (cfg *config) crash1(i int) { 99 | cfg.disconnect(i) 100 | cfg.net.DeleteServer(i) // disable client connections to the server. 101 | 102 | cfg.mu.Lock() 103 | defer cfg.mu.Unlock() 104 | 105 | // a fresh persister, in case old instance 106 | // continues to update the Persister. 
107 | // but copy old persister's content so that we always 108 | // pass Make() the last persisted state. 109 | if cfg.saved[i] != nil { 110 | cfg.saved[i] = cfg.saved[i].Copy() 111 | } 112 | 113 | rf := cfg.rafts[i] 114 | if rf != nil { 115 | cfg.mu.Unlock() 116 | rf.Kill() 117 | cfg.mu.Lock() 118 | cfg.rafts[i] = nil 119 | } 120 | 121 | if cfg.saved[i] != nil { 122 | raftlog := cfg.saved[i].ReadRaftState() 123 | cfg.saved[i] = &Persister{} 124 | cfg.saved[i].SaveRaftState(raftlog) 125 | } 126 | } 127 | 128 | // 129 | // start or re-start a Raft. 130 | // if one already exists, "kill" it first. 131 | // allocate new outgoing port file names, and a new 132 | // state persister, to isolate previous instance of 133 | // this server. since we cannot really kill it. 134 | // 135 | func (cfg *config) start1(i int) { 136 | cfg.crash1(i) 137 | 138 | // a fresh set of outgoing ClientEnd names. 139 | // so that old crashed instance's ClientEnds can't send. 140 | cfg.endnames[i] = make([]string, cfg.n) 141 | for j := 0; j < cfg.n; j++ { 142 | cfg.endnames[i][j] = randstring(20) 143 | } 144 | 145 | // a fresh set of ClientEnds. 146 | ends := make([]*labrpc.ClientEnd, cfg.n) 147 | for j := 0; j < cfg.n; j++ { 148 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 149 | cfg.net.Connect(cfg.endnames[i][j], j) 150 | } 151 | 152 | cfg.mu.Lock() 153 | 154 | // a fresh persister, so old instance doesn't overwrite 155 | // new instance's persisted state. 156 | // but copy old persister's content so that we always 157 | // pass Make() the last persisted state. 158 | if cfg.saved[i] != nil { 159 | cfg.saved[i] = cfg.saved[i].Copy() 160 | } else { 161 | cfg.saved[i] = MakePersister() 162 | } 163 | 164 | cfg.mu.Unlock() 165 | 166 | // listen to messages from Raft indicating newly committed messages. 167 | applyCh := make(chan ApplyMsg) 168 | go func() { 169 | for m := range applyCh { 170 | err_msg := "" 171 | if m.CommandValid == false { 172 | // ignore other types of ApplyMsg 173 | } else if v, ok := (m.Command).(int); ok { 174 | cfg.mu.Lock() 175 | for j := 0; j < len(cfg.logs); j++ { 176 | if old, oldok := cfg.logs[j][m.CommandIndex]; oldok && old != v { 177 | // some server has already committed a different value for this entry! 178 | err_msg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v", 179 | m.CommandIndex, i, m.Command, j, old) 180 | } 181 | } 182 | _, prevok := cfg.logs[i][m.CommandIndex-1] 183 | cfg.logs[i][m.CommandIndex] = v 184 | if m.CommandIndex > cfg.maxIndex { 185 | cfg.maxIndex = m.CommandIndex 186 | } 187 | cfg.mu.Unlock() 188 | 189 | if m.CommandIndex > 1 && prevok == false { 190 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 191 | } 192 | } else { 193 | err_msg = fmt.Sprintf("committed command %v is not an int", m.Command) 194 | } 195 | 196 | if err_msg != "" { 197 | log.Fatalf("apply error: %v\n", err_msg) 198 | cfg.applyErr[i] = err_msg 199 | // keep reading after error so that Raft doesn't block 200 | // holding locks... 
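// (taken together, the checks above spell out the applyCh contract the
// labs expect: one ApplyMsg per committed entry with CommandValid set,
// indexes applied in order on each server, and the same command at any
// given index on every server.)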
201 | } 202 | } 203 | }() 204 | 205 | rf := Make(ends, i, cfg.saved[i], applyCh) 206 | 207 | cfg.mu.Lock() 208 | cfg.rafts[i] = rf 209 | cfg.mu.Unlock() 210 | 211 | svc := labrpc.MakeService(rf) 212 | srv := labrpc.MakeServer() 213 | srv.AddService(svc) 214 | cfg.net.AddServer(i, srv) 215 | } 216 | 217 | func (cfg *config) checkTimeout() { 218 | // enforce a two minute real-time limit on each test 219 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 220 | cfg.t.Fatal("test took longer than 120 seconds") 221 | } 222 | } 223 | 224 | func (cfg *config) cleanup() { 225 | for i := 0; i < len(cfg.rafts); i++ { 226 | if cfg.rafts[i] != nil { 227 | cfg.rafts[i].Kill() 228 | } 229 | } 230 | cfg.net.Cleanup() 231 | cfg.checkTimeout() 232 | } 233 | 234 | // attach server i to the net. 235 | func (cfg *config) connect(i int) { 236 | // fmt.Printf("connect(%d)\n", i) 237 | 238 | cfg.connected[i] = true 239 | 240 | // outgoing ClientEnds 241 | for j := 0; j < cfg.n; j++ { 242 | if cfg.connected[j] { 243 | endname := cfg.endnames[i][j] 244 | cfg.net.Enable(endname, true) 245 | } 246 | } 247 | 248 | // incoming ClientEnds 249 | for j := 0; j < cfg.n; j++ { 250 | if cfg.connected[j] { 251 | endname := cfg.endnames[j][i] 252 | cfg.net.Enable(endname, true) 253 | } 254 | } 255 | } 256 | 257 | // detach server i from the net. 258 | func (cfg *config) disconnect(i int) { 259 | // fmt.Printf("disconnect(%d)\n", i) 260 | 261 | cfg.connected[i] = false 262 | 263 | // outgoing ClientEnds 264 | for j := 0; j < cfg.n; j++ { 265 | if cfg.endnames[i] != nil { 266 | endname := cfg.endnames[i][j] 267 | cfg.net.Enable(endname, false) 268 | } 269 | } 270 | 271 | // incoming ClientEnds 272 | for j := 0; j < cfg.n; j++ { 273 | if cfg.endnames[j] != nil { 274 | endname := cfg.endnames[j][i] 275 | cfg.net.Enable(endname, false) 276 | } 277 | } 278 | } 279 | 280 | func (cfg *config) rpcCount(server int) int { 281 | return cfg.net.GetCount(server) 282 | } 283 | 284 | func (cfg *config) rpcTotal() int { 285 | return cfg.net.GetTotalCount() 286 | } 287 | 288 | func (cfg *config) setunreliable(unrel bool) { 289 | cfg.net.Reliable(!unrel) 290 | } 291 | 292 | func (cfg *config) setlongreordering(longrel bool) { 293 | cfg.net.LongReordering(longrel) 294 | } 295 | 296 | // check that there's exactly one leader. 297 | // try a few times in case re-elections are needed. 298 | func (cfg *config) checkOneLeader() int { 299 | for iters := 0; iters < 10; iters++ { 300 | ms := 450 + (rand.Int63() % 100) 301 | time.Sleep(time.Duration(ms) * time.Millisecond) 302 | 303 | leaders := make(map[int][]int) 304 | for i := 0; i < cfg.n; i++ { 305 | if cfg.connected[i] { 306 | if term, leader := cfg.rafts[i].GetState(); leader { 307 | leaders[term] = append(leaders[term], i) 308 | } 309 | } 310 | } 311 | 312 | lastTermWithLeader := -1 313 | for term, leaders := range leaders { 314 | if len(leaders) > 1 { 315 | cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders)) 316 | } 317 | if term > lastTermWithLeader { 318 | lastTermWithLeader = term 319 | } 320 | } 321 | 322 | if len(leaders) != 0 { 323 | return leaders[lastTermWithLeader][0] 324 | } 325 | } 326 | cfg.t.Fatalf("expected one leader, got none") 327 | return -1 328 | } 329 | 330 | // check that everyone agrees on the term. 
331 | func (cfg *config) checkTerms() int { 332 | term := -1 333 | for i := 0; i < cfg.n; i++ { 334 | if cfg.connected[i] { 335 | xterm, _ := cfg.rafts[i].GetState() 336 | if term == -1 { 337 | term = xterm 338 | } else if term != xterm { 339 | cfg.t.Fatalf("servers disagree on term") 340 | } 341 | } 342 | } 343 | return term 344 | } 345 | 346 | // check that there's no leader 347 | func (cfg *config) checkNoLeader() { 348 | for i := 0; i < cfg.n; i++ { 349 | if cfg.connected[i] { 350 | _, is_leader := cfg.rafts[i].GetState() 351 | if is_leader { 352 | cfg.t.Fatalf("expected no leader, but %v claims to be leader", i) 353 | } 354 | } 355 | } 356 | } 357 | 358 | // how many servers think a log entry is committed? 359 | func (cfg *config) nCommitted(index int) (int, interface{}) { 360 | count := 0 361 | cmd := -1 362 | for i := 0; i < len(cfg.rafts); i++ { 363 | if cfg.applyErr[i] != "" { 364 | cfg.t.Fatal(cfg.applyErr[i]) 365 | } 366 | 367 | cfg.mu.Lock() 368 | cmd1, ok := cfg.logs[i][index] 369 | cfg.mu.Unlock() 370 | 371 | if ok { 372 | if count > 0 && cmd != cmd1 { 373 | cfg.t.Fatalf("committed values do not match: index %v, %v, %v\n", 374 | index, cmd, cmd1) 375 | } 376 | count += 1 377 | cmd = cmd1 378 | } 379 | } 380 | return count, cmd 381 | } 382 | 383 | // wait for at least n servers to commit. 384 | // but don't wait forever. 385 | func (cfg *config) wait(index int, n int, startTerm int) interface{} { 386 | to := 10 * time.Millisecond 387 | for iters := 0; iters < 30; iters++ { 388 | nd, _ := cfg.nCommitted(index) 389 | if nd >= n { 390 | break 391 | } 392 | time.Sleep(to) 393 | if to < time.Second { 394 | to *= 2 395 | } 396 | if startTerm > -1 { 397 | for _, r := range cfg.rafts { 398 | if t, _ := r.GetState(); t > startTerm { 399 | // someone has moved on 400 | // can no longer guarantee that we'll "win" 401 | return -1 402 | } 403 | } 404 | } 405 | } 406 | nd, cmd := cfg.nCommitted(index) 407 | if nd < n { 408 | cfg.t.Fatalf("only %d decided for index %d; wanted %d\n", 409 | nd, index, n) 410 | } 411 | return cmd 412 | } 413 | 414 | // do a complete agreement. 415 | // it might choose the wrong leader initially, 416 | // and have to re-submit after giving up. 417 | // entirely gives up after about 10 seconds. 418 | // indirectly checks that the servers agree on the 419 | // same value, since nCommitted() checks this, 420 | // as do the threads that read from applyCh. 421 | // returns index. 422 | // if retry==true, may submit the command multiple 423 | // times, in case a leader fails just after Start(). 424 | // if retry==false, calls Start() only once, in order 425 | // to simplify the early Lab 2B tests. 426 | func (cfg *config) one(cmd int, expectedServers int, retry bool) int { 427 | t0 := time.Now() 428 | starts := 0 429 | for time.Since(t0).Seconds() < 10 { 430 | // try all the servers, maybe one is the leader. 431 | index := -1 432 | for si := 0; si < cfg.n; si++ { 433 | starts = (starts + 1) % cfg.n 434 | var rf *Raft 435 | cfg.mu.Lock() 436 | if cfg.connected[starts] { 437 | rf = cfg.rafts[starts] 438 | } 439 | cfg.mu.Unlock() 440 | if rf != nil { 441 | index1, _, ok := rf.Start(cmd) 442 | if ok { 443 | index = index1 444 | break 445 | } 446 | } 447 | } 448 | 449 | if index != -1 { 450 | // somebody claimed to be the leader and to have 451 | // submitted our command; wait a while for agreement. 
452 | t1 := time.Now() 453 | for time.Since(t1).Seconds() < 2 { 454 | nd, cmd1 := cfg.nCommitted(index) 455 | if nd > 0 && nd >= expectedServers { 456 | // committed 457 | if cmd2, ok := cmd1.(int); ok && cmd2 == cmd { 458 | // and it was the command we submitted. 459 | return index 460 | } 461 | } 462 | time.Sleep(20 * time.Millisecond) 463 | } 464 | if retry == false { 465 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 466 | } 467 | } else { 468 | time.Sleep(50 * time.Millisecond) 469 | } 470 | } 471 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 472 | return -1 473 | } 474 | 475 | // start a Test. 476 | // print the Test message. 477 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 478 | func (cfg *config) begin(description string) { 479 | fmt.Printf("%s ...\n", description) 480 | cfg.t0 = time.Now() 481 | cfg.rpcs0 = cfg.rpcTotal() 482 | cfg.cmds0 = 0 483 | cfg.maxIndex0 = cfg.maxIndex 484 | } 485 | 486 | // end a Test -- the fact that we got here means there 487 | // was no failure. 488 | // print the Passed message, 489 | // and some performance numbers. 490 | func (cfg *config) end() { 491 | cfg.checkTimeout() 492 | if cfg.t.Failed() == false { 493 | cfg.mu.Lock() 494 | t := time.Since(cfg.t0).Seconds() // real time 495 | npeers := cfg.n // number of Raft peers 496 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 497 | ncmds := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported 498 | cfg.mu.Unlock() 499 | 500 | fmt.Printf(" ... Passed --") 501 | fmt.Printf(" %4.1f %d %4d %4d\n", t, npeers, nrpc, ncmds) 502 | } 503 | } 504 | -------------------------------------------------------------------------------- /src/raft/persister.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft and kvraft to save persistent 5 | // Raft state (log &c) and k/v server snapshots. 6 | // 7 | // we will use the original persister.go to test your code for grading. 8 | // so, while you can modify this code to help you debug, please 9 | // test with the original before submitting. 10 | // 11 | 12 | import "sync" 13 | 14 | type Persister struct { 15 | mu sync.Mutex 16 | raftstate []byte 17 | snapshot []byte 18 | } 19 | 20 | func MakePersister() *Persister { 21 | return &Persister{} 22 | } 23 | 24 | func (ps *Persister) Copy() *Persister { 25 | ps.mu.Lock() 26 | defer ps.mu.Unlock() 27 | np := MakePersister() 28 | np.raftstate = ps.raftstate 29 | np.snapshot = ps.snapshot 30 | return np 31 | } 32 | 33 | func (ps *Persister) SaveRaftState(state []byte) { 34 | ps.mu.Lock() 35 | defer ps.mu.Unlock() 36 | ps.raftstate = state 37 | } 38 | 39 | func (ps *Persister) ReadRaftState() []byte { 40 | ps.mu.Lock() 41 | defer ps.mu.Unlock() 42 | return ps.raftstate 43 | } 44 | 45 | func (ps *Persister) RaftStateSize() int { 46 | ps.mu.Lock() 47 | defer ps.mu.Unlock() 48 | return len(ps.raftstate) 49 | } 50 | 51 | // Save both Raft state and K/V snapshot as a single atomic action, 52 | // to help avoid them getting out of sync. 
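// callers typically serialize the snapshot with labgob before handing it
// over; for example (a sketch only -- kvData and raftState are placeholder
// names, not part of this file):
//
// w := new(bytes.Buffer)
// e := labgob.NewEncoder(w)
// e.Encode(kvData)
// persister.SaveStateAndSnapshot(raftState, w.Bytes())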
53 | func (ps *Persister) SaveStateAndSnapshot(state []byte, snapshot []byte) { 54 | ps.mu.Lock() 55 | defer ps.mu.Unlock() 56 | ps.raftstate = state 57 | ps.snapshot = snapshot 58 | } 59 | 60 | func (ps *Persister) ReadSnapshot() []byte { 61 | ps.mu.Lock() 62 | defer ps.mu.Unlock() 63 | return ps.snapshot 64 | } 65 | 66 | func (ps *Persister) SnapshotSize() int { 67 | ps.mu.Lock() 68 | defer ps.mu.Unlock() 69 | return len(ps.snapshot) 70 | } 71 | -------------------------------------------------------------------------------- /src/raft/raft.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // this is an outline of the API that raft must expose to 5 | // the service (or tester). see comments below for 6 | // each of these functions for more details. 7 | // 8 | // rf = Make(...) 9 | // create a new Raft server. 10 | // rf.Start(command interface{}) (index, term, isleader) 11 | // start agreement on a new log entry 12 | // rf.GetState() (term, isLeader) 13 | // ask a Raft for its current term, and whether it thinks it is leader 14 | // ApplyMsg 15 | // each time a new entry is committed to the log, each Raft peer 16 | // should send an ApplyMsg to the service (or tester) 17 | // in the same server. 18 | // 19 | 20 | import ( 21 | "fmt" 22 | "math/rand" 23 | "strconv" 24 | "sync" 25 | "time" 26 | ) 27 | import "sync/atomic" 28 | import "Mit6824/src/labrpc" 29 | 30 | // import "bytes" 31 | // import "labgob" 32 | 33 | // 34 | // as each Raft peer becomes aware that successive log entries are 35 | // committed, the peer should send an ApplyMsg to the service (or 36 | // tester) on the same server, via the applyCh passed to Make(). set 37 | // CommandValid to true to indicate that the ApplyMsg contains a newly 38 | // committed log entry. 39 | // 40 | // in Lab 3 you'll want to send other kinds of messages (e.g., 41 | // snapshots) on the applyCh; at that point you can add fields to 42 | // ApplyMsg, but set CommandValid to false for these other uses. 43 | // 44 | type ApplyMsg struct { 45 | CommandValid bool 46 | Command interface{} 47 | CommandIndex int 48 | } 49 | 50 | // 51 | // A Go object implementing a single Raft peer. 52 | // 53 | // TODO: 对raft 各种关键行为的操作 没有保证原子性 54 | type Raft struct { 55 | // TODO: 这里可以该成读写锁 进一步优化 56 | mu sync.Mutex // Lock to protect shared access to this peer's state 57 | peers []*labrpc.ClientEnd // RPC end points of all peers 58 | persister *Persister // Object to hold this peer's persisted state 59 | me int // this peer's index into peers[] 60 | dead int32 // set by Kill() 61 | closeChan chan struct{} // 关闭信号 62 | // Your data here (2A, 2B, 2C). 63 | // Look at the paper's Figure 2 for a description of what 64 | // state a Raft server must maintain. 65 | // 唯一 持久化的配置 66 | roles int32 // 标示当前对等点当前的角色 1 跟随者 2 候选人 3 领导者 67 | id atomic.Value // 对等点唯一标示 68 | curVoteTarget atomic.Value // 当一次投票给node的ID 69 | myTerm int64 // 最后的已知的任期 70 | // TODO: 是否存在 一个并发安全的 list? 
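// (the standard library has no concurrency-safe slice; the usual answer is
// the mu field above -- guard every read and write of the log and index
// fields below with rf.mu rather than hunting for a lock-free container.)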
71 | logs []Wal // 日志条目 72 | lastApplyIdx int // 最后应用于状态机的日志索引 73 | lastCommitIdx int // 最后的提交日志索引 74 | electionTimer *time.Timer // 用于选举的定时器 75 | } 76 | 77 | func (rf *Raft) initLeader() { 78 | // TODO: 初始化 leader 相关数据状态 79 | rf.setCurVoteTarget(rf.getId()) 80 | } 81 | func (rf *Raft) isLeader() bool { 82 | return atomic.CompareAndSwapInt32(&rf.roles, 3, 3) 83 | } 84 | 85 | // 只有从一个候选人才能变更为领导者 86 | func (rf *Raft) coronation() bool { 87 | return atomic.CompareAndSwapInt32(&rf.roles, 2, 3) 88 | } 89 | 90 | func (rf *Raft) isFollower() bool { 91 | return atomic.LoadInt32(&rf.roles) == 1 92 | } 93 | 94 | func (rf *Raft) following() { 95 | atomic.StoreInt32(&rf.roles, 1) 96 | } 97 | 98 | func (rf *Raft) isCandidate() bool { 99 | return atomic.LoadInt32(&rf.roles) == 2 100 | } 101 | 102 | // 只能从 跟随者 变为候选人 103 | func (rf *Raft) setCandidate() bool { 104 | return atomic.CompareAndSwapInt32(&rf.roles, 1, 2) 105 | } 106 | 107 | func (rf *Raft) getLastCommitIdx() int { 108 | //rf.mu.Lock() 109 | //defer rf.mu.Unlock() 110 | return rf.lastCommitIdx 111 | } 112 | 113 | func (rf *Raft) setLogs(los []Wal) { 114 | //rf.mu.Lock() 115 | //defer rf.mu.Unlock() 116 | rf.logs = los 117 | } 118 | func (rf *Raft) setMyTerm(term int64) { 119 | atomic.StoreInt64(&rf.myTerm, term) 120 | } 121 | 122 | func (rf *Raft) getCurVoteTarget() string { 123 | return rf.curVoteTarget.Load().(string) 124 | } 125 | func (rf *Raft) setCurVoteTarget(vote string) { 126 | rf.curVoteTarget.Store(vote) 127 | } 128 | 129 | func (rf *Raft) setLastCommitIdx(lastCommitIdx int) { 130 | //rf.mu.Lock() 131 | //defer rf.mu.Unlock() 132 | rf.lastCommitIdx = lastCommitIdx 133 | } 134 | 135 | func (rf *Raft) getMyTerm() int64 { 136 | return atomic.LoadInt64(&rf.myTerm) 137 | } 138 | func (rf *Raft) incrMyTerm() int64 { 139 | return atomic.AddInt64(&rf.myTerm, 1) 140 | } 141 | func (rf *Raft) getMe() int { 142 | //rf.mu.Lock() 143 | //defer rf.mu.Unlock() 144 | return rf.me 145 | } 146 | 147 | func (rf *Raft) getId() string { 148 | return rf.id.Load().(string) 149 | } 150 | 151 | func (rf *Raft) setId(id string) { 152 | rf.id.Store(id) 153 | } 154 | 155 | type Wal struct { 156 | term int64 157 | cmd string 158 | } 159 | 160 | // return currentTerm and whether this server 161 | // believes it is the leader. 162 | func (rf *Raft) GetState() (int, bool) { 163 | // Your code here (2A). 164 | return int(rf.getMyTerm()), rf.isLeader() 165 | } 166 | 167 | // 168 | // save Raft's persistent state to stable storage, 169 | // where it can later be retrieved after a crash and restart. 170 | // see paper's Figure 2 for a description of what should be persistent. 171 | // 172 | func (rf *Raft) persist() { 173 | // Your code here (2C). 174 | // Example: 175 | // w := new(bytes.Buffer) 176 | // e := labgob.NewEncoder(w) 177 | // e.Encode(rf.xxx) 178 | // e.Encode(rf.yyy) 179 | // data := w.Bytes() 180 | // rf.persister.SaveRaftState(data) 181 | } 182 | 183 | // 184 | // restore previously persisted state. 185 | // 186 | func (rf *Raft) readPersist(data []byte) { 187 | if data == nil || len(data) < 1 { // bootstrap without any state? 188 | return 189 | } 190 | // Your code here (2C). 191 | // Example: 192 | // r := bytes.NewBuffer(data) 193 | // d := labgob.NewDecoder(r) 194 | // var xxx 195 | // var yyy 196 | // if d.Decode(&xxx) != nil || 197 | // d.Decode(&yyy) != nil { 198 | // error... 199 | // } else { 200 | // rf.xxx = xxx 201 | // rf.yyy = yyy 202 | // } 203 | } 204 | 205 | // 206 | // example RequestVote RPC arguments structure. 
207 | // field names must start with capital letters! 208 | // 209 | type RequestVoteArgs struct { 210 | // Your data here (2A, 2B). 211 | CandidateID string 212 | CandidateTerm int64 213 | LastLogIdx int 214 | LastLogTerm int64 215 | } 216 | 217 | // 218 | // example RequestVote RPC reply structure. 219 | // field names must start with capital letters! 220 | // 221 | type RequestVoteReply struct { 222 | // Your data here (2A). 223 | CurTerm int64 224 | IsVote bool 225 | } 226 | 227 | // 228 | // example RequestVote RPC handler. 229 | // 230 | // TODO: 这里可能存在 选票瓜分时 同时竞选成为领导的问题 草你妈的 这一周也没搞定 气死哎呀 231 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 232 | // Your code here (2A, 2B). 233 | //curVoteTarget := rf.getCurVoteTarget() 234 | rf.mu.Lock() 235 | defer rf.mu.Unlock() 236 | 237 | reply.CurTerm = rf.getMyTerm() 238 | // 如果相同 那么说明同时超时 彼此肯定不投票 选票被瓜分 239 | if args.CandidateTerm <= reply.CurTerm { 240 | reply.IsVote = false 241 | return 242 | } 243 | var lastLog Wal 244 | var lastLogIdx int 245 | // TODO: 对获取最后的日志这里应该进行抽象 246 | if len(rf.logs) > 0 { 247 | lastLog = rf.logs[len(rf.logs)-1] 248 | lastLogIdx = len(rf.logs) - 1 249 | } 250 | //rf.mu.Unlock() 251 | if lastLog.term > args.CandidateTerm /*|| len(curVoteTarget) != 0*/ || lastLogIdx > args.LastLogIdx { 252 | reply.IsVote = false 253 | return 254 | } 255 | rf.setMyTerm(args.CandidateTerm) 256 | rf.setCurVoteTarget(args.CandidateID) 257 | rf.following() 258 | // 在投票后重新等待一个选举超时时间,也就是说 选票会抑制跟随者成为候选者,如果节点投票相当于放弃了最近一次的竞选 259 | fmt.Println(rf.getId(), "在任期", reply.CurTerm, "投票给", args.CandidateID, "后任期变为", rf.getMyTerm()) 260 | rf.electionTimer.Reset(getElectionTimeOut()) 261 | reply.IsVote = true 262 | return 263 | } 264 | 265 | // 266 | // example code to send a RequestVote RPC to a server. 267 | // server is the index of the target server in rf.peers[]. 268 | // expects RPC arguments in args. 269 | // fills in *reply with RPC reply, so caller should 270 | // pass &reply. 271 | // the types of the args and reply passed to Call() must be 272 | // the same as the types of the arguments declared in the 273 | // handler function (including whether they are pointers). 274 | // 275 | // The labrpc package simulates a lossy network, in which servers 276 | // may be unreachable, and in which requests and replies may be lost. 277 | // Call() sends a request and waits for a reply. If a reply arrives 278 | // within a timeout interval, Call() returns true; otherwise 279 | // Call() returns false. Thus Call() may not return for a while. 280 | // A false return can be caused by a dead server, a live server that 281 | // can't be reached, a lost request, or a lost reply. 282 | // 283 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 284 | // handler function on the server side does not return. Thus there 285 | // is no need to implement your own timeouts around Call(). 286 | // 287 | // look at the comments in ../labrpc/labrpc.go for more details. 288 | // 289 | // if you're having trouble getting RPC to work, check that you've 290 | // capitalized all field names in structs passed over RPC, and 291 | // that the caller passes the address of the reply struct with &, not 292 | // the struct itself. 
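// because Call() may block, candidates usually fire the RequestVote RPCs
// from separate goroutines and tally replies as they arrive rather than
// looping over peers serially; a sketch (ch and args are illustrative
// names, not part of this file):
//
// ch := make(chan bool, len(rf.peers))
// for i := range rf.peers {
// 	if i == rf.me {
// 		continue
// 	}
// 	go func(server int) {
// 		reply := RequestVoteReply{}
// 		ok := rf.sendRequestVote(server, &args, &reply)
// 		ch <- ok && reply.IsVote
// 	}(i)
// }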
293 | // 294 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) bool { 295 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 296 | return ok 297 | } 298 | 299 | // 300 | // 附加日志/心跳rpc 请求 301 | // 302 | type AppendEntriesArgs struct { 303 | // Your data here (2A, 2B). 304 | LeaderID string 305 | LeaderTerm int64 306 | PreLogIdx int 307 | PreLogTerm int64 308 | LastCommit int 309 | Logs []interface{} 310 | } 311 | 312 | // 313 | // 附加日志/心跳 rpc 的回复 314 | // 315 | type AppendEntriesReply struct { 316 | // Your data here (2A). 317 | CurrTerm int64 318 | IsOK bool 319 | } 320 | 321 | // 322 | // 附加日志/心跳 rpc 的执行体 323 | // 324 | func (rf *Raft) RequestAppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) { 325 | // Your code here (2A, 2B). 326 | rf.mu.Lock() 327 | defer rf.mu.Unlock() 328 | myTerm := rf.getMyTerm() 329 | reply.CurrTerm = myTerm 330 | if args.LeaderTerm < myTerm { 331 | reply.IsOK = false 332 | return 333 | } 334 | if len(args.Logs) == 0 && args.LeaderTerm >= myTerm { 335 | // 如果 请求的任期更高 那么就更新自己认为的leader节点 336 | if args.LeaderTerm > myTerm { 337 | rf.following() 338 | rf.setCurVoteTarget(args.LeaderID) 339 | rf.setMyTerm(args.LeaderTerm) 340 | } 341 | reply.IsOK = true 342 | rf.electionTimer.Reset(getElectionTimeOut()) 343 | return 344 | } 345 | } 346 | 347 | // 附加日志/心跳 rpc 的发送函数 348 | func (rf *Raft) sendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) bool { 349 | rf.mu.Lock() 350 | defer rf.mu.Unlock() 351 | ok := rf.peers[server].Call("Raft.RequestAppendEntries", args, reply) 352 | return ok 353 | } 354 | 355 | // 356 | // the service using Raft (e.g. a k/v server) wants to start 357 | // agreement on the next command to be appended to Raft's log. if this 358 | // server isn't the leader, returns false. otherwise start the 359 | // agreement and return immediately. there is no guarantee that this 360 | // command will ever be committed to the Raft log, since the leader 361 | // may fail or lose an election. even if the Raft instance has been killed, 362 | // this function should return gracefully. 363 | // 364 | // the first return value is the index that the command will appear at 365 | // if it's ever committed. the second return value is the current 366 | // term. the third return value is true if this server believes it is 367 | // the leader. 368 | // 369 | func (rf *Raft) Start(command interface{}) (int, int, bool) { 370 | index := -1 371 | term := -1 372 | isLeader := true 373 | 374 | // Your code here (2B). 375 | 376 | return index, term, isLeader 377 | } 378 | 379 | // 380 | // the tester calls Kill() when a Raft instance won't 381 | // be needed again. for your convenience, we supply 382 | // code to set rf.dead (without needing a lock), 383 | // and a killed() method to test rf.dead in 384 | // long-running loops. you can also add your own 385 | // code to Kill(). you're not required to do anything 386 | // about this, but it may be convenient (for example) 387 | // to suppress debug output from a Kill()ed instance. 388 | // 389 | func (rf *Raft) Kill() { 390 | atomic.StoreInt32(&rf.dead, 1) 391 | // Your code here, if desired. 392 | } 393 | 394 | func (rf *Raft) killed() bool { 395 | z := atomic.LoadInt32(&rf.dead) 396 | return z == 1 397 | } 398 | 399 | // 400 | // the service or tester wants to create a Raft server. the ports 401 | // of all the Raft servers (including this one) are in peers[]. this 402 | // server's port is peers[me]. 
all the servers' peers[] arrays 403 | // have the same order. persister is a place for this server to 404 | // save its persistent state, and also initially holds the most 405 | // recent saved state, if any. applyCh is a channel on which the 406 | // tester or service expects Raft to send ApplyMsg messages. 407 | // Make() must return quickly, so it should start goroutines 408 | // for any long-running work. 409 | // 410 | func Make(peers []*labrpc.ClientEnd, me int, 411 | persister *Persister, applyCh chan ApplyMsg) *Raft { 412 | rf := &Raft{} 413 | rf.peers = peers 414 | rf.persister = persister 415 | rf.me = me 416 | rf.closeChan = make(chan struct{}) 417 | // Your initialization code here (2A, 2B, 2C). 418 | rf.setId(strconv.Itoa(me)) // TODO: 应该去除特殊字符 419 | rf.setCurVoteTarget(strconv.Itoa(me)) // TODO: 应先从持久化数据中恢复 420 | rf.logs = make([]Wal, 0) 421 | rf.myTerm = 1 // 初始化的时候 大家都认为自己是1,除非 被快照覆盖 422 | rf.following() 423 | go rf.election() 424 | go rf.heartbeat() 425 | // initialize from state persisted before a crash 426 | rf.readPersist(persister.ReadRaftState()) 427 | 428 | return rf 429 | } 430 | func getElectionTimeOut() time.Duration { 431 | rand.Seed(time.Now().UnixNano()) 432 | ts := time.Duration(300+(rand.Int63()%200)) * time.Millisecond 433 | return ts 434 | } 435 | 436 | func (rf *Raft) election() { 437 | rf.electionTimer = time.NewTimer(getElectionTimeOut()) 438 | defer rf.electionTimer.Stop() 439 | for { 440 | select { 441 | case <-rf.electionTimer.C: 442 | if !rf.setCandidate() { //成为候选人 443 | continue 444 | } 445 | rf.incrMyTerm() 446 | rf.setCurVoteTarget("") 447 | myTerm := rf.getMyTerm() 448 | me := rf.me 449 | var lastLogIdx int 450 | var lastLogTerm int64 451 | peersLen := len(rf.peers) 452 | if len(rf.logs) > 0 { 453 | lastLogIdx, lastLogTerm = len(rf.logs)-1, rf.logs[len(rf.logs)-1].term 454 | } 455 | voteArgs, voteRes := RequestVoteArgs{ 456 | CandidateID: rf.getId(), 457 | CandidateTerm: myTerm, 458 | LastLogIdx: lastLogIdx, 459 | LastLogTerm: lastLogTerm, 460 | }, RequestVoteReply{} 461 | res := make([]bool, peersLen) 462 | res[me] = true // 候选人会投自己一票 463 | for i := 0; i < peersLen; i++ { 464 | if i == me { 465 | continue 466 | } 467 | if ok := rf.sendRequestVote(i, &voteArgs, &voteRes); ok { 468 | if voteRes.CurTerm > myTerm { 469 | // 如果 他认为的民众比他任期更高 那么他就回退到跟随者 470 | rf.setMyTerm(voteRes.CurTerm) 471 | rf.following() 472 | goto f 473 | } 474 | res[i] = voteRes.IsVote 475 | } 476 | } 477 | count := 0 478 | for _, v := range res { 479 | if v { 480 | count++ 481 | } 482 | } 483 | if count >= (peersLen)/2+1 { 484 | for !rf.coronation() { 485 | } // CAS 486 | fmt.Println(rf.getId(), "在任期", rf.getMyTerm(), "成为领导者", "获得选票", res, "状态是", rf.roles) 487 | rf.initLeader() 488 | } 489 | case <-rf.closeChan: 490 | return 491 | } 492 | f: 493 | rf.electionTimer.Reset(getElectionTimeOut()) 494 | } 495 | } 496 | 497 | // TODO:expected one leader, got none 无法在 5秒内选举出leader 明天 还是去看看论文吧 这个时候 我怀疑自己的理解有问题了 草 还是不能通过 哭唧唧 498 | func (rf *Raft) heartbeat() { 499 | c := time.NewTicker(100 * time.Millisecond) 500 | for { 501 | select { 502 | case <-c.C: 503 | if rf.isLeader() { 504 | id := rf.getId() 505 | myTerm := rf.getMyTerm() 506 | peersLen := len(rf.peers) 507 | me := rf.me 508 | res := make([]bool, len(rf.peers)) 509 | res[me] = true 510 | for i := 0; i < peersLen; i++ { 511 | if i == me { 512 | continue 513 | } 514 | reply := &AppendEntriesReply{} 515 | ret := rf.sendAppendEntries(i, &AppendEntriesArgs{ 516 | LeaderID: id, 517 | LeaderTerm: myTerm, 518 | Logs: 
make([]interface{}, 0), 519 | }, reply) 520 | res[i] = ret 521 | if ret && reply.CurrTerm > myTerm { 522 | rf.setMyTerm(reply.CurrTerm) 523 | rf.following() 524 | rf.electionTimer.Reset(getElectionTimeOut()) 525 | break 526 | } 527 | } 528 | count := 0 529 | for _, v := range res { 530 | if v { 531 | count++ 532 | } 533 | } 534 | if count < (peersLen)/2+1 { 535 | rf.following() 536 | rf.electionTimer.Reset(getElectionTimeOut()) 537 | } 538 | } 539 | case <-rf.closeChan: 540 | return 541 | } 542 | } 543 | } 544 | -------------------------------------------------------------------------------- /src/raft/util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "log" 4 | 5 | // Debugging 6 | const Debug = 0 7 | 8 | func DPrintf(format string, a ...interface{}) (n int, err error) { 9 | if Debug > 0 { 10 | log.Printf(format, a...) 11 | } 12 | return 13 | } 14 | -------------------------------------------------------------------------------- /src/shardkv/client.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // client code to talk to a sharded key/value service. 5 | // 6 | // the client first talks to the shardmaster to find out 7 | // the assignment of shards (keys) to groups, and then 8 | // talks to the group that holds the key's shard. 9 | // 10 | 11 | import "labrpc" 12 | import "crypto/rand" 13 | import "math/big" 14 | import "shardmaster" 15 | import "time" 16 | 17 | // 18 | // which shard is a key in? 19 | // please use this function, 20 | // and please do not change it. 21 | // 22 | func key2shard(key string) int { 23 | shard := 0 24 | if len(key) > 0 { 25 | shard = int(key[0]) 26 | } 27 | shard %= shardmaster.NShards 28 | return shard 29 | } 30 | 31 | func nrand() int64 { 32 | max := big.NewInt(int64(1) << 62) 33 | bigx, _ := rand.Int(rand.Reader, max) 34 | x := bigx.Int64() 35 | return x 36 | } 37 | 38 | type Clerk struct { 39 | sm *shardmaster.Clerk 40 | config shardmaster.Config 41 | make_end func(string) *labrpc.ClientEnd 42 | // You will have to modify this struct. 43 | } 44 | 45 | // 46 | // the tester calls MakeClerk. 47 | // 48 | // masters[] is needed to call shardmaster.MakeClerk(). 49 | // 50 | // make_end(servername) turns a server name from a 51 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 52 | // send RPCs. 53 | // 54 | func MakeClerk(masters []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *Clerk { 55 | ck := new(Clerk) 56 | ck.sm = shardmaster.MakeClerk(masters) 57 | ck.make_end = make_end 58 | // You'll have to add code here. 59 | return ck 60 | } 61 | 62 | // 63 | // fetch the current value for a key. 64 | // returns "" if the key does not exist. 65 | // keeps trying forever in the face of all other errors. 66 | // You will have to modify this function. 67 | // 68 | func (ck *Clerk) Get(key string) string { 69 | args := GetArgs{} 70 | args.Key = key 71 | 72 | for { 73 | shard := key2shard(key) 74 | gid := ck.config.Shards[shard] 75 | if servers, ok := ck.config.Groups[gid]; ok { 76 | // try each server for the shard. 77 | for si := 0; si < len(servers); si++ { 78 | srv := ck.make_end(servers[si]) 79 | var reply GetReply 80 | ok := srv.Call("ShardKV.Get", &args, &reply) 81 | if ok && (reply.Err == OK || reply.Err == ErrNoKey) { 82 | return reply.Value 83 | } 84 | if ok && (reply.Err == ErrWrongGroup) { 85 | break 86 | } 87 | // ... 
not ok, or ErrWrongLeader 88 | } 89 | } 90 | time.Sleep(100 * time.Millisecond) 91 | // ask master for the latest configuration. 92 | ck.config = ck.sm.Query(-1) 93 | } 94 | 95 | return "" 96 | } 97 | 98 | // 99 | // shared by Put and Append. 100 | // You will have to modify this function. 101 | // 102 | func (ck *Clerk) PutAppend(key string, value string, op string) { 103 | args := PutAppendArgs{} 104 | args.Key = key 105 | args.Value = value 106 | args.Op = op 107 | 108 | for { 109 | shard := key2shard(key) 110 | gid := ck.config.Shards[shard] 111 | if servers, ok := ck.config.Groups[gid]; ok { 112 | for si := 0; si < len(servers); si++ { 113 | srv := ck.make_end(servers[si]) 114 | var reply PutAppendReply 115 | ok := srv.Call("ShardKV.PutAppend", &args, &reply) 116 | if ok && reply.Err == OK { 117 | return 118 | } 119 | if ok && reply.Err == ErrWrongGroup { 120 | break 121 | } 122 | // ... not ok, or ErrWrongLeader 123 | } 124 | } 125 | time.Sleep(100 * time.Millisecond) 126 | // ask master for the latest configuration. 127 | ck.config = ck.sm.Query(-1) 128 | } 129 | } 130 | 131 | func (ck *Clerk) Put(key string, value string) { 132 | ck.PutAppend(key, value, "Put") 133 | } 134 | func (ck *Clerk) Append(key string, value string) { 135 | ck.PutAppend(key, value, "Append") 136 | } 137 | -------------------------------------------------------------------------------- /src/shardkv/common.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // Sharded key/value server. 5 | // Lots of replica groups, each running op-at-a-time paxos. 6 | // Shardmaster decides which group serves each shard. 7 | // Shardmaster may change shard assignment from time to time. 8 | // 9 | // You will have to modify these definitions. 10 | // 11 | 12 | const ( 13 | OK = "OK" 14 | ErrNoKey = "ErrNoKey" 15 | ErrWrongGroup = "ErrWrongGroup" 16 | ErrWrongLeader = "ErrWrongLeader" 17 | ) 18 | 19 | type Err string 20 | 21 | // Put or Append 22 | type PutAppendArgs struct { 23 | // You'll have to add definitions here. 24 | Key string 25 | Value string 26 | Op string // "Put" or "Append" 27 | // You'll have to add definitions here. 28 | // Field names must start with capital letters, 29 | // otherwise RPC will break. 30 | } 31 | 32 | type PutAppendReply struct { 33 | Err Err 34 | } 35 | 36 | type GetArgs struct { 37 | Key string 38 | // You'll have to add definitions here. 
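// (commonly that means enough caller identity for the servers to filter
// duplicate requests, e.g. -- purely illustrative, not required by this
// skeleton -- a ClientId int64 plus a per-client SeqNum int64.)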
39 | } 40 | 41 | type GetReply struct { 42 | Err Err 43 | Value string 44 | } 45 | -------------------------------------------------------------------------------- /src/shardkv/config.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "shardmaster" 4 | import "labrpc" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/big" 11 | import "math/rand" 12 | import "encoding/base64" 13 | import "sync" 14 | import "runtime" 15 | import "raft" 16 | import "strconv" 17 | import "fmt" 18 | import "time" 19 | 20 | func randstring(n int) string { 21 | b := make([]byte, 2*n) 22 | crand.Read(b) 23 | s := base64.URLEncoding.EncodeToString(b) 24 | return s[0:n] 25 | } 26 | 27 | func makeSeed() int64 { 28 | max := big.NewInt(int64(1) << 62) 29 | bigx, _ := crand.Int(crand.Reader, max) 30 | x := bigx.Int64() 31 | return x 32 | } 33 | 34 | // Randomize server handles 35 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 36 | sa := make([]*labrpc.ClientEnd, len(kvh)) 37 | copy(sa, kvh) 38 | for i := range sa { 39 | j := rand.Intn(i + 1) 40 | sa[i], sa[j] = sa[j], sa[i] 41 | } 42 | return sa 43 | } 44 | 45 | type group struct { 46 | gid int 47 | servers []*ShardKV 48 | saved []*raft.Persister 49 | endnames [][]string 50 | mendnames [][]string 51 | } 52 | 53 | type config struct { 54 | mu sync.Mutex 55 | t *testing.T 56 | net *labrpc.Network 57 | start time.Time // time at which make_config() was called 58 | 59 | nmasters int 60 | masterservers []*shardmaster.ShardMaster 61 | mck *shardmaster.Clerk 62 | 63 | ngroups int 64 | n int // servers per k/v group 65 | groups []*group 66 | 67 | clerks map[*Clerk][]string 68 | nextClientId int 69 | maxraftstate int 70 | } 71 | 72 | func (cfg *config) checkTimeout() { 73 | // enforce a two minute real-time limit on each test 74 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 75 | cfg.t.Fatal("test took longer than 120 seconds") 76 | } 77 | } 78 | 79 | func (cfg *config) cleanup() { 80 | for gi := 0; gi < cfg.ngroups; gi++ { 81 | cfg.ShutdownGroup(gi) 82 | } 83 | cfg.net.Cleanup() 84 | cfg.checkTimeout() 85 | } 86 | 87 | // check that no server's log is too big. 88 | func (cfg *config) checklogs() { 89 | for gi := 0; gi < cfg.ngroups; gi++ { 90 | for i := 0; i < cfg.n; i++ { 91 | raft := cfg.groups[gi].saved[i].RaftStateSize() 92 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 93 | if cfg.maxraftstate >= 0 && raft > 2*cfg.maxraftstate { 94 | cfg.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v", 95 | raft, cfg.maxraftstate) 96 | } 97 | if cfg.maxraftstate < 0 && snap > 0 { 98 | cfg.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!") 99 | } 100 | } 101 | } 102 | } 103 | 104 | // master server name for labrpc. 105 | func (cfg *config) mastername(i int) string { 106 | return "master" + strconv.Itoa(i) 107 | } 108 | 109 | // shard server name for labrpc. 110 | // i'th server of group gid. 111 | func (cfg *config) servername(gid int, i int) string { 112 | return "server-" + strconv.Itoa(gid) + "-" + strconv.Itoa(i) 113 | } 114 | 115 | func (cfg *config) makeClient() *Clerk { 116 | cfg.mu.Lock() 117 | defer cfg.mu.Unlock() 118 | 119 | // ClientEnds to talk to master service. 
120 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 121 | endnames := make([]string, cfg.n) 122 | for j := 0; j < cfg.nmasters; j++ { 123 | endnames[j] = randstring(20) 124 | ends[j] = cfg.net.MakeEnd(endnames[j]) 125 | cfg.net.Connect(endnames[j], cfg.mastername(j)) 126 | cfg.net.Enable(endnames[j], true) 127 | } 128 | 129 | ck := MakeClerk(ends, func(servername string) *labrpc.ClientEnd { 130 | name := randstring(20) 131 | end := cfg.net.MakeEnd(name) 132 | cfg.net.Connect(name, servername) 133 | cfg.net.Enable(name, true) 134 | return end 135 | }) 136 | cfg.clerks[ck] = endnames 137 | cfg.nextClientId++ 138 | return ck 139 | } 140 | 141 | func (cfg *config) deleteClient(ck *Clerk) { 142 | cfg.mu.Lock() 143 | defer cfg.mu.Unlock() 144 | 145 | v := cfg.clerks[ck] 146 | for i := 0; i < len(v); i++ { 147 | os.Remove(v[i]) 148 | } 149 | delete(cfg.clerks, ck) 150 | } 151 | 152 | // Shutdown i'th server of gi'th group, by isolating it 153 | func (cfg *config) ShutdownServer(gi int, i int) { 154 | cfg.mu.Lock() 155 | defer cfg.mu.Unlock() 156 | 157 | gg := cfg.groups[gi] 158 | 159 | // prevent this server from sending 160 | for j := 0; j < len(gg.servers); j++ { 161 | name := gg.endnames[i][j] 162 | cfg.net.Enable(name, false) 163 | } 164 | for j := 0; j < len(gg.mendnames[i]); j++ { 165 | name := gg.mendnames[i][j] 166 | cfg.net.Enable(name, false) 167 | } 168 | 169 | // disable client connections to the server. 170 | // it's important to do this before creating 171 | // the new Persister in saved[i], to avoid 172 | // the possibility of the server returning a 173 | // positive reply to an Append but persisting 174 | // the result in the superseded Persister. 175 | cfg.net.DeleteServer(cfg.servername(gg.gid, i)) 176 | 177 | // a fresh persister, in case old instance 178 | // continues to update the Persister. 179 | // but copy old persister's content so that we always 180 | // pass Make() the last persisted state. 181 | if gg.saved[i] != nil { 182 | gg.saved[i] = gg.saved[i].Copy() 183 | } 184 | 185 | kv := gg.servers[i] 186 | if kv != nil { 187 | cfg.mu.Unlock() 188 | kv.Kill() 189 | cfg.mu.Lock() 190 | gg.servers[i] = nil 191 | } 192 | } 193 | 194 | func (cfg *config) ShutdownGroup(gi int) { 195 | for i := 0; i < cfg.n; i++ { 196 | cfg.ShutdownServer(gi, i) 197 | } 198 | } 199 | 200 | // start i'th server in gi'th group 201 | func (cfg *config) StartServer(gi int, i int) { 202 | cfg.mu.Lock() 203 | 204 | gg := cfg.groups[gi] 205 | 206 | // a fresh set of outgoing ClientEnd names 207 | // to talk to other servers in this group. 208 | gg.endnames[i] = make([]string, cfg.n) 209 | for j := 0; j < cfg.n; j++ { 210 | gg.endnames[i][j] = randstring(20) 211 | } 212 | 213 | // and the connections to other servers in this group. 214 | ends := make([]*labrpc.ClientEnd, cfg.n) 215 | for j := 0; j < cfg.n; j++ { 216 | ends[j] = cfg.net.MakeEnd(gg.endnames[i][j]) 217 | cfg.net.Connect(gg.endnames[i][j], cfg.servername(gg.gid, j)) 218 | cfg.net.Enable(gg.endnames[i][j], true) 219 | } 220 | 221 | // ends to talk to shardmaster service 222 | mends := make([]*labrpc.ClientEnd, cfg.nmasters) 223 | gg.mendnames[i] = make([]string, cfg.nmasters) 224 | for j := 0; j < cfg.nmasters; j++ { 225 | gg.mendnames[i][j] = randstring(20) 226 | mends[j] = cfg.net.MakeEnd(gg.mendnames[i][j]) 227 | cfg.net.Connect(gg.mendnames[i][j], cfg.mastername(j)) 228 | cfg.net.Enable(gg.mendnames[i][j], true) 229 | } 230 | 231 | // a fresh persister, so old instance doesn't overwrite 232 | // new instance's persisted state. 
233 | // give the fresh persister a copy of the old persister's 234 | // state, so that the spec is that we pass StartKVServer() 235 | // the last persisted state. 236 | if gg.saved[i] != nil { 237 | gg.saved[i] = gg.saved[i].Copy() 238 | } else { 239 | gg.saved[i] = raft.MakePersister() 240 | } 241 | cfg.mu.Unlock() 242 | 243 | gg.servers[i] = StartServer(ends, i, gg.saved[i], cfg.maxraftstate, 244 | gg.gid, mends, 245 | func(servername string) *labrpc.ClientEnd { 246 | name := randstring(20) 247 | end := cfg.net.MakeEnd(name) 248 | cfg.net.Connect(name, servername) 249 | cfg.net.Enable(name, true) 250 | return end 251 | }) 252 | 253 | kvsvc := labrpc.MakeService(gg.servers[i]) 254 | rfsvc := labrpc.MakeService(gg.servers[i].rf) 255 | srv := labrpc.MakeServer() 256 | srv.AddService(kvsvc) 257 | srv.AddService(rfsvc) 258 | cfg.net.AddServer(cfg.servername(gg.gid, i), srv) 259 | } 260 | 261 | func (cfg *config) StartGroup(gi int) { 262 | for i := 0; i < cfg.n; i++ { 263 | cfg.StartServer(gi, i) 264 | } 265 | } 266 | 267 | func (cfg *config) StartMasterServer(i int) { 268 | // ClientEnds to talk to other master replicas. 269 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 270 | for j := 0; j < cfg.nmasters; j++ { 271 | endname := randstring(20) 272 | ends[j] = cfg.net.MakeEnd(endname) 273 | cfg.net.Connect(endname, cfg.mastername(j)) 274 | cfg.net.Enable(endname, true) 275 | } 276 | 277 | p := raft.MakePersister() 278 | 279 | cfg.masterservers[i] = shardmaster.StartServer(ends, i, p) 280 | 281 | msvc := labrpc.MakeService(cfg.masterservers[i]) 282 | rfsvc := labrpc.MakeService(cfg.masterservers[i].Raft()) 283 | srv := labrpc.MakeServer() 284 | srv.AddService(msvc) 285 | srv.AddService(rfsvc) 286 | cfg.net.AddServer(cfg.mastername(i), srv) 287 | } 288 | 289 | func (cfg *config) shardclerk() *shardmaster.Clerk { 290 | // ClientEnds to talk to master service. 291 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 292 | for j := 0; j < cfg.nmasters; j++ { 293 | name := randstring(20) 294 | ends[j] = cfg.net.MakeEnd(name) 295 | cfg.net.Connect(name, cfg.mastername(j)) 296 | cfg.net.Enable(name, true) 297 | } 298 | 299 | return shardmaster.MakeClerk(ends) 300 | } 301 | 302 | // tell the shardmaster that a group is joining. 303 | func (cfg *config) join(gi int) { 304 | cfg.joinm([]int{gi}) 305 | } 306 | 307 | func (cfg *config) joinm(gis []int) { 308 | m := make(map[int][]string, len(gis)) 309 | for _, g := range gis { 310 | gid := cfg.groups[g].gid 311 | servernames := make([]string, cfg.n) 312 | for i := 0; i < cfg.n; i++ { 313 | servernames[i] = cfg.servername(gid, i) 314 | } 315 | m[gid] = servernames 316 | } 317 | cfg.mck.Join(m) 318 | } 319 | 320 | // tell the shardmaster that a group is leaving. 
321 | func (cfg *config) leave(gi int) { 322 | cfg.leavem([]int{gi}) 323 | } 324 | 325 | func (cfg *config) leavem(gis []int) { 326 | gids := make([]int, 0, len(gis)) 327 | for _, g := range gis { 328 | gids = append(gids, cfg.groups[g].gid) 329 | } 330 | cfg.mck.Leave(gids) 331 | } 332 | 333 | var ncpu_once sync.Once 334 | 335 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 336 | ncpu_once.Do(func() { 337 | if runtime.NumCPU() < 2 { 338 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 339 | } 340 | rand.Seed(makeSeed()) 341 | }) 342 | runtime.GOMAXPROCS(4) 343 | cfg := &config{} 344 | cfg.t = t 345 | cfg.maxraftstate = maxraftstate 346 | cfg.net = labrpc.MakeNetwork() 347 | cfg.start = time.Now() 348 | 349 | // master 350 | cfg.nmasters = 3 351 | cfg.masterservers = make([]*shardmaster.ShardMaster, cfg.nmasters) 352 | for i := 0; i < cfg.nmasters; i++ { 353 | cfg.StartMasterServer(i) 354 | } 355 | cfg.mck = cfg.shardclerk() 356 | 357 | cfg.ngroups = 3 358 | cfg.groups = make([]*group, cfg.ngroups) 359 | cfg.n = n 360 | for gi := 0; gi < cfg.ngroups; gi++ { 361 | gg := &group{} 362 | cfg.groups[gi] = gg 363 | gg.gid = 100 + gi 364 | gg.servers = make([]*ShardKV, cfg.n) 365 | gg.saved = make([]*raft.Persister, cfg.n) 366 | gg.endnames = make([][]string, cfg.n) 367 | gg.mendnames = make([][]string, cfg.nmasters) 368 | for i := 0; i < cfg.n; i++ { 369 | cfg.StartServer(gi, i) 370 | } 371 | } 372 | 373 | cfg.clerks = make(map[*Clerk][]string) 374 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 375 | 376 | cfg.net.Reliable(!unreliable) 377 | 378 | return cfg 379 | } 380 | -------------------------------------------------------------------------------- /src/shardkv/server.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // import "shardmaster" 4 | import "labrpc" 5 | import "raft" 6 | import "sync" 7 | import "labgob" 8 | 9 | type Op struct { 10 | // Your definitions here. 11 | // Field names must start with capital letters, 12 | // otherwise RPC will break. 13 | } 14 | 15 | type ShardKV struct { 16 | mu sync.Mutex 17 | me int 18 | rf *raft.Raft 19 | applyCh chan raft.ApplyMsg 20 | make_end func(string) *labrpc.ClientEnd 21 | gid int 22 | masters []*labrpc.ClientEnd 23 | maxraftstate int // snapshot if log grows this big 24 | 25 | // Your definitions here. 26 | } 27 | 28 | func (kv *ShardKV) Get(args *GetArgs, reply *GetReply) { 29 | // Your code here. 30 | } 31 | 32 | func (kv *ShardKV) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 33 | // Your code here. 34 | } 35 | 36 | // 37 | // the tester calls Kill() when a ShardKV instance won't 38 | // be needed again. you are not required to do anything 39 | // in Kill(), but it might be convenient to (for example) 40 | // turn off debug output from this instance. 41 | // 42 | func (kv *ShardKV) Kill() { 43 | kv.rf.Kill() 44 | // Your code here, if desired. 45 | } 46 | 47 | // 48 | // servers[] contains the ports of the servers in this group. 49 | // 50 | // me is the index of the current server in servers[]. 51 | // 52 | // the k/v server should store snapshots through the underlying Raft 53 | // implementation, which should call persister.SaveStateAndSnapshot() to 54 | // atomically save the Raft state along with the snapshot. 
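// in practice the server watches persister.RaftStateSize() after each
// applied command and, once it crosses the threshold described just below,
// encodes its own tables and hands them to Raft; a sketch (keeping a
// persister reference inside ShardKV is an addition, not part of this
// skeleton):
//
// if kv.maxraftstate != -1 && kv.persister.RaftStateSize() > kv.maxraftstate {
// 	// encode the k/v state with labgob, then have Raft trim its log and
// 	// call persister.SaveStateAndSnapshot() with the result.
// }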
55 | // 56 | // the k/v server should snapshot when Raft's saved state exceeds 57 | // maxraftstate bytes, in order to allow Raft to garbage-collect its 58 | // log. if maxraftstate is -1, you don't need to snapshot. 59 | // 60 | // gid is this group's GID, for interacting with the shardmaster. 61 | // 62 | // pass masters[] to shardmaster.MakeClerk() so you can send 63 | // RPCs to the shardmaster. 64 | // 65 | // make_end(servername) turns a server name from a 66 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 67 | // send RPCs. You'll need this to send RPCs to other groups. 68 | // 69 | // look at client.go for examples of how to use masters[] 70 | // and make_end() to send RPCs to the group owning a specific shard. 71 | // 72 | // StartServer() must return quickly, so it should start goroutines 73 | // for any long-running work. 74 | // 75 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, gid int, masters []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *ShardKV { 76 | // call labgob.Register on structures you want 77 | // Go's RPC library to marshall/unmarshall. 78 | labgob.Register(Op{}) 79 | 80 | kv := new(ShardKV) 81 | kv.me = me 82 | kv.maxraftstate = maxraftstate 83 | kv.make_end = make_end 84 | kv.gid = gid 85 | kv.masters = masters 86 | 87 | // Your initialization code here. 88 | 89 | // Use something like this to talk to the shardmaster: 90 | // kv.mck = shardmaster.MakeClerk(kv.masters) 91 | 92 | kv.applyCh = make(chan raft.ApplyMsg) 93 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 94 | 95 | return kv 96 | } 97 | -------------------------------------------------------------------------------- /src/shardkv/test_test.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "linearizability" 4 | 5 | import "testing" 6 | import "strconv" 7 | import "time" 8 | import "fmt" 9 | import "sync/atomic" 10 | import "sync" 11 | import "math/rand" 12 | 13 | const linearizabilityCheckTimeout = 1 * time.Second 14 | 15 | func check(t *testing.T, ck *Clerk, key string, value string) { 16 | v := ck.Get(key) 17 | if v != value { 18 | t.Fatalf("Get(%v): expected:\n%v\nreceived:\n%v", key, value, v) 19 | } 20 | } 21 | 22 | // 23 | // test static 2-way sharding, without shard movement. 24 | // 25 | func TestStaticShards(t *testing.T) { 26 | fmt.Printf("Test: static shards ...\n") 27 | 28 | cfg := make_config(t, 3, false, -1) 29 | defer cfg.cleanup() 30 | 31 | ck := cfg.makeClient() 32 | 33 | cfg.join(0) 34 | cfg.join(1) 35 | 36 | n := 10 37 | ka := make([]string, n) 38 | va := make([]string, n) 39 | for i := 0; i < n; i++ { 40 | ka[i] = strconv.Itoa(i) // ensure multiple shards 41 | va[i] = randstring(20) 42 | ck.Put(ka[i], va[i]) 43 | } 44 | for i := 0; i < n; i++ { 45 | check(t, ck, ka[i], va[i]) 46 | } 47 | 48 | // make sure that the data really is sharded by 49 | // shutting down one shard and checking that some 50 | // Get()s don't succeed. 51 | cfg.ShutdownGroup(1) 52 | cfg.checklogs() // forbid snapshots 53 | 54 | ch := make(chan bool) 55 | for xi := 0; xi < n; xi++ { 56 | ck1 := cfg.makeClient() // only one call allowed per client 57 | go func(i int) { 58 | defer func() { ch <- true }() 59 | check(t, ck1, ka[i], va[i]) 60 | }(xi) 61 | } 62 | 63 | // wait a bit, only about half the Gets should succeed. 
64 | ndone := 0 65 | done := false 66 | for done == false { 67 | select { 68 | case <-ch: 69 | ndone += 1 70 | case <-time.After(time.Second * 2): 71 | done = true 72 | break 73 | } 74 | } 75 | 76 | if ndone != 5 { 77 | t.Fatalf("expected 5 completions with one shard dead; got %v\n", ndone) 78 | } 79 | 80 | // bring the crashed shard/group back to life. 81 | cfg.StartGroup(1) 82 | for i := 0; i < n; i++ { 83 | check(t, ck, ka[i], va[i]) 84 | } 85 | 86 | fmt.Printf(" ... Passed\n") 87 | } 88 | 89 | func TestJoinLeave(t *testing.T) { 90 | fmt.Printf("Test: join then leave ...\n") 91 | 92 | cfg := make_config(t, 3, false, -1) 93 | defer cfg.cleanup() 94 | 95 | ck := cfg.makeClient() 96 | 97 | cfg.join(0) 98 | 99 | n := 10 100 | ka := make([]string, n) 101 | va := make([]string, n) 102 | for i := 0; i < n; i++ { 103 | ka[i] = strconv.Itoa(i) // ensure multiple shards 104 | va[i] = randstring(5) 105 | ck.Put(ka[i], va[i]) 106 | } 107 | for i := 0; i < n; i++ { 108 | check(t, ck, ka[i], va[i]) 109 | } 110 | 111 | cfg.join(1) 112 | 113 | for i := 0; i < n; i++ { 114 | check(t, ck, ka[i], va[i]) 115 | x := randstring(5) 116 | ck.Append(ka[i], x) 117 | va[i] += x 118 | } 119 | 120 | cfg.leave(0) 121 | 122 | for i := 0; i < n; i++ { 123 | check(t, ck, ka[i], va[i]) 124 | x := randstring(5) 125 | ck.Append(ka[i], x) 126 | va[i] += x 127 | } 128 | 129 | // allow time for shards to transfer. 130 | time.Sleep(1 * time.Second) 131 | 132 | cfg.checklogs() 133 | cfg.ShutdownGroup(0) 134 | 135 | for i := 0; i < n; i++ { 136 | check(t, ck, ka[i], va[i]) 137 | } 138 | 139 | fmt.Printf(" ... Passed\n") 140 | } 141 | 142 | func TestSnapshot(t *testing.T) { 143 | fmt.Printf("Test: snapshots, join, and leave ...\n") 144 | 145 | cfg := make_config(t, 3, false, 1000) 146 | defer cfg.cleanup() 147 | 148 | ck := cfg.makeClient() 149 | 150 | cfg.join(0) 151 | 152 | n := 30 153 | ka := make([]string, n) 154 | va := make([]string, n) 155 | for i := 0; i < n; i++ { 156 | ka[i] = strconv.Itoa(i) // ensure multiple shards 157 | va[i] = randstring(20) 158 | ck.Put(ka[i], va[i]) 159 | } 160 | for i := 0; i < n; i++ { 161 | check(t, ck, ka[i], va[i]) 162 | } 163 | 164 | cfg.join(1) 165 | cfg.join(2) 166 | cfg.leave(0) 167 | 168 | for i := 0; i < n; i++ { 169 | check(t, ck, ka[i], va[i]) 170 | x := randstring(20) 171 | ck.Append(ka[i], x) 172 | va[i] += x 173 | } 174 | 175 | cfg.leave(1) 176 | cfg.join(0) 177 | 178 | for i := 0; i < n; i++ { 179 | check(t, ck, ka[i], va[i]) 180 | x := randstring(20) 181 | ck.Append(ka[i], x) 182 | va[i] += x 183 | } 184 | 185 | time.Sleep(1 * time.Second) 186 | 187 | for i := 0; i < n; i++ { 188 | check(t, ck, ka[i], va[i]) 189 | } 190 | 191 | time.Sleep(1 * time.Second) 192 | 193 | cfg.checklogs() 194 | 195 | cfg.ShutdownGroup(0) 196 | cfg.ShutdownGroup(1) 197 | cfg.ShutdownGroup(2) 198 | 199 | cfg.StartGroup(0) 200 | cfg.StartGroup(1) 201 | cfg.StartGroup(2) 202 | 203 | for i := 0; i < n; i++ { 204 | check(t, ck, ka[i], va[i]) 205 | } 206 | 207 | fmt.Printf(" ... 
Passed\n") 208 | } 209 | 210 | func TestMissChange(t *testing.T) { 211 | fmt.Printf("Test: servers miss configuration changes...\n") 212 | 213 | cfg := make_config(t, 3, false, 1000) 214 | defer cfg.cleanup() 215 | 216 | ck := cfg.makeClient() 217 | 218 | cfg.join(0) 219 | 220 | n := 10 221 | ka := make([]string, n) 222 | va := make([]string, n) 223 | for i := 0; i < n; i++ { 224 | ka[i] = strconv.Itoa(i) // ensure multiple shards 225 | va[i] = randstring(20) 226 | ck.Put(ka[i], va[i]) 227 | } 228 | for i := 0; i < n; i++ { 229 | check(t, ck, ka[i], va[i]) 230 | } 231 | 232 | cfg.join(1) 233 | 234 | cfg.ShutdownServer(0, 0) 235 | cfg.ShutdownServer(1, 0) 236 | cfg.ShutdownServer(2, 0) 237 | 238 | cfg.join(2) 239 | cfg.leave(1) 240 | cfg.leave(0) 241 | 242 | for i := 0; i < n; i++ { 243 | check(t, ck, ka[i], va[i]) 244 | x := randstring(20) 245 | ck.Append(ka[i], x) 246 | va[i] += x 247 | } 248 | 249 | cfg.join(1) 250 | 251 | for i := 0; i < n; i++ { 252 | check(t, ck, ka[i], va[i]) 253 | x := randstring(20) 254 | ck.Append(ka[i], x) 255 | va[i] += x 256 | } 257 | 258 | cfg.StartServer(0, 0) 259 | cfg.StartServer(1, 0) 260 | cfg.StartServer(2, 0) 261 | 262 | for i := 0; i < n; i++ { 263 | check(t, ck, ka[i], va[i]) 264 | x := randstring(20) 265 | ck.Append(ka[i], x) 266 | va[i] += x 267 | } 268 | 269 | time.Sleep(2 * time.Second) 270 | 271 | cfg.ShutdownServer(0, 1) 272 | cfg.ShutdownServer(1, 1) 273 | cfg.ShutdownServer(2, 1) 274 | 275 | cfg.join(0) 276 | cfg.leave(2) 277 | 278 | for i := 0; i < n; i++ { 279 | check(t, ck, ka[i], va[i]) 280 | x := randstring(20) 281 | ck.Append(ka[i], x) 282 | va[i] += x 283 | } 284 | 285 | cfg.StartServer(0, 1) 286 | cfg.StartServer(1, 1) 287 | cfg.StartServer(2, 1) 288 | 289 | for i := 0; i < n; i++ { 290 | check(t, ck, ka[i], va[i]) 291 | } 292 | 293 | fmt.Printf(" ... 
Passed\n") 294 | } 295 | 296 | func TestConcurrent1(t *testing.T) { 297 | fmt.Printf("Test: concurrent puts and configuration changes...\n") 298 | 299 | cfg := make_config(t, 3, false, 100) 300 | defer cfg.cleanup() 301 | 302 | ck := cfg.makeClient() 303 | 304 | cfg.join(0) 305 | 306 | n := 10 307 | ka := make([]string, n) 308 | va := make([]string, n) 309 | for i := 0; i < n; i++ { 310 | ka[i] = strconv.Itoa(i) // ensure multiple shards 311 | va[i] = randstring(5) 312 | ck.Put(ka[i], va[i]) 313 | } 314 | 315 | var done int32 316 | ch := make(chan bool) 317 | 318 | ff := func(i int) { 319 | defer func() { ch <- true }() 320 | ck1 := cfg.makeClient() 321 | for atomic.LoadInt32(&done) == 0 { 322 | x := randstring(5) 323 | ck1.Append(ka[i], x) 324 | va[i] += x 325 | time.Sleep(10 * time.Millisecond) 326 | } 327 | } 328 | 329 | for i := 0; i < n; i++ { 330 | go ff(i) 331 | } 332 | 333 | time.Sleep(150 * time.Millisecond) 334 | cfg.join(1) 335 | time.Sleep(500 * time.Millisecond) 336 | cfg.join(2) 337 | time.Sleep(500 * time.Millisecond) 338 | cfg.leave(0) 339 | 340 | cfg.ShutdownGroup(0) 341 | time.Sleep(100 * time.Millisecond) 342 | cfg.ShutdownGroup(1) 343 | time.Sleep(100 * time.Millisecond) 344 | cfg.ShutdownGroup(2) 345 | 346 | cfg.leave(2) 347 | 348 | time.Sleep(100 * time.Millisecond) 349 | cfg.StartGroup(0) 350 | cfg.StartGroup(1) 351 | cfg.StartGroup(2) 352 | 353 | time.Sleep(100 * time.Millisecond) 354 | cfg.join(0) 355 | cfg.leave(1) 356 | time.Sleep(500 * time.Millisecond) 357 | cfg.join(1) 358 | 359 | time.Sleep(1 * time.Second) 360 | 361 | atomic.StoreInt32(&done, 1) 362 | for i := 0; i < n; i++ { 363 | <-ch 364 | } 365 | 366 | for i := 0; i < n; i++ { 367 | check(t, ck, ka[i], va[i]) 368 | } 369 | 370 | fmt.Printf(" ... Passed\n") 371 | } 372 | 373 | // 374 | // this tests the various sources from which a re-starting 375 | // group might need to fetch shard contents. 376 | // 377 | func TestConcurrent2(t *testing.T) { 378 | fmt.Printf("Test: more concurrent puts and configuration changes...\n") 379 | 380 | cfg := make_config(t, 3, false, -1) 381 | defer cfg.cleanup() 382 | 383 | ck := cfg.makeClient() 384 | 385 | cfg.join(1) 386 | cfg.join(0) 387 | cfg.join(2) 388 | 389 | n := 10 390 | ka := make([]string, n) 391 | va := make([]string, n) 392 | for i := 0; i < n; i++ { 393 | ka[i] = strconv.Itoa(i) // ensure multiple shards 394 | va[i] = randstring(1) 395 | ck.Put(ka[i], va[i]) 396 | } 397 | 398 | var done int32 399 | ch := make(chan bool) 400 | 401 | ff := func(i int, ck1 *Clerk) { 402 | defer func() { ch <- true }() 403 | for atomic.LoadInt32(&done) == 0 { 404 | x := randstring(1) 405 | ck1.Append(ka[i], x) 406 | va[i] += x 407 | time.Sleep(50 * time.Millisecond) 408 | } 409 | } 410 | 411 | for i := 0; i < n; i++ { 412 | ck1 := cfg.makeClient() 413 | go ff(i, ck1) 414 | } 415 | 416 | cfg.leave(0) 417 | cfg.leave(2) 418 | time.Sleep(3000 * time.Millisecond) 419 | cfg.join(0) 420 | cfg.join(2) 421 | cfg.leave(1) 422 | time.Sleep(3000 * time.Millisecond) 423 | cfg.join(1) 424 | cfg.leave(0) 425 | cfg.leave(2) 426 | time.Sleep(3000 * time.Millisecond) 427 | 428 | cfg.ShutdownGroup(1) 429 | cfg.ShutdownGroup(2) 430 | time.Sleep(1000 * time.Millisecond) 431 | cfg.StartGroup(1) 432 | cfg.StartGroup(2) 433 | 434 | time.Sleep(2 * time.Second) 435 | 436 | atomic.StoreInt32(&done, 1) 437 | for i := 0; i < n; i++ { 438 | <-ch 439 | } 440 | 441 | for i := 0; i < n; i++ { 442 | check(t, ck, ka[i], va[i]) 443 | } 444 | 445 | fmt.Printf(" ... 
Passed\n") 446 | } 447 | 448 | func TestUnreliable1(t *testing.T) { 449 | fmt.Printf("Test: unreliable 1...\n") 450 | 451 | cfg := make_config(t, 3, true, 100) 452 | defer cfg.cleanup() 453 | 454 | ck := cfg.makeClient() 455 | 456 | cfg.join(0) 457 | 458 | n := 10 459 | ka := make([]string, n) 460 | va := make([]string, n) 461 | for i := 0; i < n; i++ { 462 | ka[i] = strconv.Itoa(i) // ensure multiple shards 463 | va[i] = randstring(5) 464 | ck.Put(ka[i], va[i]) 465 | } 466 | 467 | cfg.join(1) 468 | cfg.join(2) 469 | cfg.leave(0) 470 | 471 | for ii := 0; ii < n*2; ii++ { 472 | i := ii % n 473 | check(t, ck, ka[i], va[i]) 474 | x := randstring(5) 475 | ck.Append(ka[i], x) 476 | va[i] += x 477 | } 478 | 479 | cfg.join(0) 480 | cfg.leave(1) 481 | 482 | for ii := 0; ii < n*2; ii++ { 483 | i := ii % n 484 | check(t, ck, ka[i], va[i]) 485 | } 486 | 487 | fmt.Printf(" ... Passed\n") 488 | } 489 | 490 | func TestUnreliable2(t *testing.T) { 491 | fmt.Printf("Test: unreliable 2...\n") 492 | 493 | cfg := make_config(t, 3, true, 100) 494 | defer cfg.cleanup() 495 | 496 | ck := cfg.makeClient() 497 | 498 | cfg.join(0) 499 | 500 | n := 10 501 | ka := make([]string, n) 502 | va := make([]string, n) 503 | for i := 0; i < n; i++ { 504 | ka[i] = strconv.Itoa(i) // ensure multiple shards 505 | va[i] = randstring(5) 506 | ck.Put(ka[i], va[i]) 507 | } 508 | 509 | var done int32 510 | ch := make(chan bool) 511 | 512 | ff := func(i int) { 513 | defer func() { ch <- true }() 514 | ck1 := cfg.makeClient() 515 | for atomic.LoadInt32(&done) == 0 { 516 | x := randstring(5) 517 | ck1.Append(ka[i], x) 518 | va[i] += x 519 | } 520 | } 521 | 522 | for i := 0; i < n; i++ { 523 | go ff(i) 524 | } 525 | 526 | time.Sleep(150 * time.Millisecond) 527 | cfg.join(1) 528 | time.Sleep(500 * time.Millisecond) 529 | cfg.join(2) 530 | time.Sleep(500 * time.Millisecond) 531 | cfg.leave(0) 532 | time.Sleep(500 * time.Millisecond) 533 | cfg.leave(1) 534 | time.Sleep(500 * time.Millisecond) 535 | cfg.join(1) 536 | cfg.join(0) 537 | 538 | time.Sleep(2 * time.Second) 539 | 540 | atomic.StoreInt32(&done, 1) 541 | cfg.net.Reliable(true) 542 | for i := 0; i < n; i++ { 543 | <-ch 544 | } 545 | 546 | for i := 0; i < n; i++ { 547 | check(t, ck, ka[i], va[i]) 548 | } 549 | 550 | fmt.Printf(" ... 
Passed\n") 551 | } 552 | 553 | func TestUnreliable3(t *testing.T) { 554 | fmt.Printf("Test: unreliable 3...\n") 555 | 556 | cfg := make_config(t, 3, true, 100) 557 | defer cfg.cleanup() 558 | 559 | begin := time.Now() 560 | var operations []linearizability.Operation 561 | var opMu sync.Mutex 562 | 563 | ck := cfg.makeClient() 564 | 565 | cfg.join(0) 566 | 567 | n := 10 568 | ka := make([]string, n) 569 | va := make([]string, n) 570 | for i := 0; i < n; i++ { 571 | ka[i] = strconv.Itoa(i) // ensure multiple shards 572 | va[i] = randstring(5) 573 | start := int64(time.Since(begin)) 574 | ck.Put(ka[i], va[i]) 575 | end := int64(time.Since(begin)) 576 | inp := linearizability.KvInput{Op: 1, Key: ka[i], Value: va[i]} 577 | var out linearizability.KvOutput 578 | op := linearizability.Operation{Input: inp, Call: start, Output: out, Return: end} 579 | operations = append(operations, op) 580 | } 581 | 582 | var done int32 583 | ch := make(chan bool) 584 | 585 | ff := func(i int) { 586 | defer func() { ch <- true }() 587 | ck1 := cfg.makeClient() 588 | for atomic.LoadInt32(&done) == 0 { 589 | ki := rand.Int() % n 590 | nv := randstring(5) 591 | var inp linearizability.KvInput 592 | var out linearizability.KvOutput 593 | start := int64(time.Since(begin)) 594 | if (rand.Int() % 1000) < 500 { 595 | ck1.Append(ka[ki], nv) 596 | inp = linearizability.KvInput{Op: 2, Key: ka[ki], Value: nv} 597 | } else if (rand.Int() % 1000) < 100 { 598 | ck1.Put(ka[ki], nv) 599 | inp = linearizability.KvInput{Op: 1, Key: ka[ki], Value: nv} 600 | } else { 601 | v := ck1.Get(ka[ki]) 602 | inp = linearizability.KvInput{Op: 0, Key: ka[ki]} 603 | out = linearizability.KvOutput{Value: v} 604 | } 605 | end := int64(time.Since(begin)) 606 | op := linearizability.Operation{Input: inp, Call: start, Output: out, Return: end} 607 | opMu.Lock() 608 | operations = append(operations, op) 609 | opMu.Unlock() 610 | } 611 | } 612 | 613 | for i := 0; i < n; i++ { 614 | go ff(i) 615 | } 616 | 617 | time.Sleep(150 * time.Millisecond) 618 | cfg.join(1) 619 | time.Sleep(500 * time.Millisecond) 620 | cfg.join(2) 621 | time.Sleep(500 * time.Millisecond) 622 | cfg.leave(0) 623 | time.Sleep(500 * time.Millisecond) 624 | cfg.leave(1) 625 | time.Sleep(500 * time.Millisecond) 626 | cfg.join(1) 627 | cfg.join(0) 628 | 629 | time.Sleep(2 * time.Second) 630 | 631 | atomic.StoreInt32(&done, 1) 632 | cfg.net.Reliable(true) 633 | for i := 0; i < n; i++ { 634 | <-ch 635 | } 636 | 637 | // log.Printf("Checking linearizability of %d operations", len(operations)) 638 | // start := time.Now() 639 | ok := linearizability.CheckOperationsTimeout(linearizability.KvModel(), operations, linearizabilityCheckTimeout) 640 | // dur := time.Since(start) 641 | // log.Printf("Linearizability check done in %s; result: %t", time.Since(start).String(), ok) 642 | if !ok { 643 | t.Fatal("history is not linearizable") 644 | } 645 | 646 | fmt.Printf(" ... Passed\n") 647 | } 648 | 649 | // 650 | // optional test to see whether servers are deleting 651 | // shards for which they are no longer responsible. 652 | // 653 | func TestChallenge1Delete(t *testing.T) { 654 | fmt.Printf("Test: shard deletion (challenge 1) ...\n") 655 | 656 | // "1" means force snapshot after every log entry. 657 | cfg := make_config(t, 3, false, 1) 658 | defer cfg.cleanup() 659 | 660 | ck := cfg.makeClient() 661 | 662 | cfg.join(0) 663 | 664 | // 30,000 bytes of total values. 
665 | n := 30 666 | ka := make([]string, n) 667 | va := make([]string, n) 668 | for i := 0; i < n; i++ { 669 | ka[i] = strconv.Itoa(i) 670 | va[i] = randstring(1000) 671 | ck.Put(ka[i], va[i]) 672 | } 673 | for i := 0; i < 3; i++ { 674 | check(t, ck, ka[i], va[i]) 675 | } 676 | 677 | for iters := 0; iters < 2; iters++ { 678 | cfg.join(1) 679 | cfg.leave(0) 680 | cfg.join(2) 681 | time.Sleep(3 * time.Second) 682 | for i := 0; i < 3; i++ { 683 | check(t, ck, ka[i], va[i]) 684 | } 685 | cfg.leave(1) 686 | cfg.join(0) 687 | cfg.leave(2) 688 | time.Sleep(3 * time.Second) 689 | for i := 0; i < 3; i++ { 690 | check(t, ck, ka[i], va[i]) 691 | } 692 | } 693 | 694 | cfg.join(1) 695 | cfg.join(2) 696 | time.Sleep(1 * time.Second) 697 | for i := 0; i < 3; i++ { 698 | check(t, ck, ka[i], va[i]) 699 | } 700 | time.Sleep(1 * time.Second) 701 | for i := 0; i < 3; i++ { 702 | check(t, ck, ka[i], va[i]) 703 | } 704 | time.Sleep(1 * time.Second) 705 | for i := 0; i < 3; i++ { 706 | check(t, ck, ka[i], va[i]) 707 | } 708 | 709 | total := 0 710 | for gi := 0; gi < cfg.ngroups; gi++ { 711 | for i := 0; i < cfg.n; i++ { 712 | raft := cfg.groups[gi].saved[i].RaftStateSize() 713 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 714 | total += raft + snap 715 | } 716 | } 717 | 718 | // 27 keys should be stored once. 719 | // 3 keys should also be stored in client dup tables. 720 | // everything on 3 replicas. 721 | // plus slop. 722 | expected := 3 * (((n - 3) * 1000) + 2*3*1000 + 6000) 723 | if total > expected { 724 | t.Fatalf("snapshot + persisted Raft state are too big: %v > %v\n", total, expected) 725 | } 726 | 727 | for i := 0; i < n; i++ { 728 | check(t, ck, ka[i], va[i]) 729 | } 730 | 731 | fmt.Printf(" ... Passed\n") 732 | } 733 | 734 | func TestChallenge1Concurrent(t *testing.T) { 735 | fmt.Printf("Test: concurrent configuration change and restart (challenge 1)...\n") 736 | 737 | cfg := make_config(t, 3, false, 300) 738 | defer cfg.cleanup() 739 | 740 | ck := cfg.makeClient() 741 | 742 | cfg.join(0) 743 | 744 | n := 10 745 | ka := make([]string, n) 746 | va := make([]string, n) 747 | for i := 0; i < n; i++ { 748 | ka[i] = strconv.Itoa(i) 749 | va[i] = randstring(1) 750 | ck.Put(ka[i], va[i]) 751 | } 752 | 753 | var done int32 754 | ch := make(chan bool) 755 | 756 | ff := func(i int, ck1 *Clerk) { 757 | defer func() { ch <- true }() 758 | for atomic.LoadInt32(&done) == 0 { 759 | x := randstring(1) 760 | ck1.Append(ka[i], x) 761 | va[i] += x 762 | } 763 | } 764 | 765 | for i := 0; i < n; i++ { 766 | ck1 := cfg.makeClient() 767 | go ff(i, ck1) 768 | } 769 | 770 | t0 := time.Now() 771 | for time.Since(t0) < 12*time.Second { 772 | cfg.join(2) 773 | cfg.join(1) 774 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 775 | cfg.ShutdownGroup(0) 776 | cfg.ShutdownGroup(1) 777 | cfg.ShutdownGroup(2) 778 | cfg.StartGroup(0) 779 | cfg.StartGroup(1) 780 | cfg.StartGroup(2) 781 | 782 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 783 | cfg.leave(1) 784 | cfg.leave(2) 785 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 786 | } 787 | 788 | time.Sleep(2 * time.Second) 789 | 790 | atomic.StoreInt32(&done, 1) 791 | for i := 0; i < n; i++ { 792 | <-ch 793 | } 794 | 795 | for i := 0; i < n; i++ { 796 | check(t, ck, ka[i], va[i]) 797 | } 798 | 799 | fmt.Printf(" ... 
Passed\n") 800 | } 801 | 802 | // 803 | // optional test to see whether servers can handle 804 | // shards that are not affected by a config change 805 | // while the config change is underway 806 | // 807 | func TestChallenge2Unaffected(t *testing.T) { 808 | fmt.Printf("Test: unaffected shard access (challenge 2) ...\n") 809 | 810 | cfg := make_config(t, 3, true, 100) 811 | defer cfg.cleanup() 812 | 813 | ck := cfg.makeClient() 814 | 815 | // JOIN 100 816 | cfg.join(0) 817 | 818 | // Do a bunch of puts to keys in all shards 819 | n := 10 820 | ka := make([]string, n) 821 | va := make([]string, n) 822 | for i := 0; i < n; i++ { 823 | ka[i] = strconv.Itoa(i) // ensure multiple shards 824 | va[i] = "100" 825 | ck.Put(ka[i], va[i]) 826 | } 827 | 828 | // JOIN 101 829 | cfg.join(1) 830 | 831 | // QUERY to find shards now owned by 101 832 | c := cfg.mck.Query(-1) 833 | owned := make(map[int]bool, n) 834 | for s, gid := range c.Shards { 835 | owned[s] = gid == cfg.groups[1].gid 836 | } 837 | 838 | // Wait for migration to new config to complete, and for clients to 839 | // start using this updated config. Gets to any key k such that 840 | // owned[shard(k)] == true should now be served by group 101. 841 | <-time.After(1 * time.Second) 842 | for i := 0; i < n; i++ { 843 | if owned[i] { 844 | va[i] = "101" 845 | ck.Put(ka[i], va[i]) 846 | } 847 | } 848 | 849 | // KILL 100 850 | cfg.ShutdownGroup(0) 851 | 852 | // LEAVE 100 853 | // 101 doesn't get a chance to migrate things previously owned by 100 854 | cfg.leave(0) 855 | 856 | // Wait to make sure clients see new config 857 | <-time.After(1 * time.Second) 858 | 859 | // And finally: check that gets/puts for 101-owned keys still complete 860 | for i := 0; i < n; i++ { 861 | shard := int(ka[i][0]) % 10 862 | if owned[shard] { 863 | check(t, ck, ka[i], va[i]) 864 | ck.Put(ka[i], va[i]+"-1") 865 | check(t, ck, ka[i], va[i]+"-1") 866 | } 867 | } 868 | 869 | fmt.Printf(" ... Passed\n") 870 | } 871 | 872 | // 873 | // optional test to see whether servers can handle operations on shards that 874 | // have been received as a part of a config migration when the entire migration 875 | // has not yet completed. 876 | // 877 | func TestChallenge2Partial(t *testing.T) { 878 | fmt.Printf("Test: partial migration shard access (challenge 2) ...\n") 879 | 880 | cfg := make_config(t, 3, true, 100) 881 | defer cfg.cleanup() 882 | 883 | ck := cfg.makeClient() 884 | 885 | // JOIN 100 + 101 + 102 886 | cfg.joinm([]int{0, 1, 2}) 887 | 888 | // Give the implementation some time to reconfigure 889 | <-time.After(1 * time.Second) 890 | 891 | // Do a bunch of puts to keys in all shards 892 | n := 10 893 | ka := make([]string, n) 894 | va := make([]string, n) 895 | for i := 0; i < n; i++ { 896 | ka[i] = strconv.Itoa(i) // ensure multiple shards 897 | va[i] = "100" 898 | ck.Put(ka[i], va[i]) 899 | } 900 | 901 | // QUERY to find shards owned by 102 902 | c := cfg.mck.Query(-1) 903 | owned := make(map[int]bool, n) 904 | for s, gid := range c.Shards { 905 | owned[s] = gid == cfg.groups[2].gid 906 | } 907 | 908 | // KILL 100 909 | cfg.ShutdownGroup(0) 910 | 911 | // LEAVE 100 + 102 912 | // 101 can get old shards from 102, but not from 100. 
101 should start 913 | // serving shards that used to belong to 102 as soon as possible 914 | cfg.leavem([]int{0, 2}) 915 | 916 | // Give the implementation some time to start reconfiguration 917 | // And to migrate 102 -> 101 918 | <-time.After(1 * time.Second) 919 | 920 | // And finally: check that gets/puts for 101-owned keys now complete 921 | for i := 0; i < n; i++ { 922 | shard := key2shard(ka[i]) 923 | if owned[shard] { 924 | check(t, ck, ka[i], va[i]) 925 | ck.Put(ka[i], va[i]+"-2") 926 | check(t, ck, ka[i], va[i]+"-2") 927 | } 928 | } 929 | 930 | fmt.Printf(" ... Passed\n") 931 | } 932 | -------------------------------------------------------------------------------- /src/shardmaster/client.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | // 4 | // Shardmaster clerk. 5 | // 6 | 7 | import "labrpc" 8 | import "time" 9 | import "crypto/rand" 10 | import "math/big" 11 | 12 | type Clerk struct { 13 | servers []*labrpc.ClientEnd 14 | // Your data here. 15 | } 16 | 17 | func nrand() int64 { 18 | max := big.NewInt(int64(1) << 62) 19 | bigx, _ := rand.Int(rand.Reader, max) 20 | x := bigx.Int64() 21 | return x 22 | } 23 | 24 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 25 | ck := new(Clerk) 26 | ck.servers = servers 27 | // Your code here. 28 | return ck 29 | } 30 | 31 | func (ck *Clerk) Query(num int) Config { 32 | args := &QueryArgs{} 33 | // Your code here. 34 | args.Num = num 35 | for { 36 | // try each known server. 37 | for _, srv := range ck.servers { 38 | var reply QueryReply 39 | ok := srv.Call("ShardMaster.Query", args, &reply) 40 | if ok && reply.WrongLeader == false { 41 | return reply.Config 42 | } 43 | } 44 | time.Sleep(100 * time.Millisecond) 45 | } 46 | } 47 | 48 | func (ck *Clerk) Join(servers map[int][]string) { 49 | args := &JoinArgs{} 50 | // Your code here. 51 | args.Servers = servers 52 | 53 | for { 54 | // try each known server. 55 | for _, srv := range ck.servers { 56 | var reply JoinReply 57 | ok := srv.Call("ShardMaster.Join", args, &reply) 58 | if ok && reply.WrongLeader == false { 59 | return 60 | } 61 | } 62 | time.Sleep(100 * time.Millisecond) 63 | } 64 | } 65 | 66 | func (ck *Clerk) Leave(gids []int) { 67 | args := &LeaveArgs{} 68 | // Your code here. 69 | args.GIDs = gids 70 | 71 | for { 72 | // try each known server. 73 | for _, srv := range ck.servers { 74 | var reply LeaveReply 75 | ok := srv.Call("ShardMaster.Leave", args, &reply) 76 | if ok && reply.WrongLeader == false { 77 | return 78 | } 79 | } 80 | time.Sleep(100 * time.Millisecond) 81 | } 82 | } 83 | 84 | func (ck *Clerk) Move(shard int, gid int) { 85 | args := &MoveArgs{} 86 | // Your code here. 87 | args.Shard = shard 88 | args.GID = gid 89 | 90 | for { 91 | // try each known server. 92 | for _, srv := range ck.servers { 93 | var reply MoveReply 94 | ok := srv.Call("ShardMaster.Move", args, &reply) 95 | if ok && reply.WrongLeader == false { 96 | return 97 | } 98 | } 99 | time.Sleep(100 * time.Millisecond) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/shardmaster/common.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | // 4 | // Master shard server: assigns shards to replication groups. 5 | // 6 | // RPC interface: 7 | // Join(servers) -- add a set of groups (gid -> server-list mapping). 8 | // Leave(gids) -- delete a set of groups. 
9 | // Move(shard, gid) -- hand off one shard from current owner to gid. 10 | // Query(num) -> fetch Config # num, or latest config if num==-1. 11 | // 12 | // A Config (configuration) describes a set of replica groups, and the 13 | // replica group responsible for each shard. Configs are numbered. Config 14 | // #0 is the initial configuration, with no groups and all shards 15 | // assigned to group 0 (the invalid group). 16 | // 17 | // You will need to add fields to the RPC argument structs. 18 | // 19 | 20 | // The number of shards. 21 | const NShards = 10 22 | 23 | // A configuration -- an assignment of shards to groups. 24 | // Please don't change this. 25 | type Config struct { 26 | Num int // config number 27 | Shards [NShards]int // shard -> gid 28 | Groups map[int][]string // gid -> servers[] 29 | } 30 | 31 | const ( 32 | OK = "OK" 33 | ) 34 | 35 | type Err string 36 | 37 | type JoinArgs struct { 38 | Servers map[int][]string // new GID -> servers mappings 39 | } 40 | 41 | type JoinReply struct { 42 | WrongLeader bool 43 | Err Err 44 | } 45 | 46 | type LeaveArgs struct { 47 | GIDs []int 48 | } 49 | 50 | type LeaveReply struct { 51 | WrongLeader bool 52 | Err Err 53 | } 54 | 55 | type MoveArgs struct { 56 | Shard int 57 | GID int 58 | } 59 | 60 | type MoveReply struct { 61 | WrongLeader bool 62 | Err Err 63 | } 64 | 65 | type QueryArgs struct { 66 | Num int // desired config number 67 | } 68 | 69 | type QueryReply struct { 70 | WrongLeader bool 71 | Err Err 72 | Config Config 73 | } 74 | -------------------------------------------------------------------------------- /src/shardmaster/config.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import "labrpc" 4 | import "raft" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "time" 15 | 16 | func randstring(n int) string { 17 | b := make([]byte, 2*n) 18 | crand.Read(b) 19 | s := base64.URLEncoding.EncodeToString(b) 20 | return s[0:n] 21 | } 22 | 23 | // Randomize server handles 24 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 25 | sa := make([]*labrpc.ClientEnd, len(kvh)) 26 | copy(sa, kvh) 27 | for i := range sa { 28 | j := rand.Intn(i + 1) 29 | sa[i], sa[j] = sa[j], sa[i] 30 | } 31 | return sa 32 | } 33 | 34 | type config struct { 35 | mu sync.Mutex 36 | t *testing.T 37 | net *labrpc.Network 38 | n int 39 | servers []*ShardMaster 40 | saved []*raft.Persister 41 | endnames [][]string // names of each server's sending ClientEnds 42 | clerks map[*Clerk][]string 43 | nextClientId int 44 | start time.Time // time at which make_config() was called 45 | } 46 | 47 | func (cfg *config) checkTimeout() { 48 | // enforce a two minute real-time limit on each test 49 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 50 | cfg.t.Fatal("test took longer than 120 seconds") 51 | } 52 | } 53 | 54 | func (cfg *config) cleanup() { 55 | cfg.mu.Lock() 56 | defer cfg.mu.Unlock() 57 | for i := 0; i < len(cfg.servers); i++ { 58 | if cfg.servers[i] != nil { 59 | cfg.servers[i].Kill() 60 | } 61 | } 62 | cfg.net.Cleanup() 63 | cfg.checkTimeout() 64 | } 65 | 66 | // Maximum log size across all servers 67 | func (cfg *config) LogSize() int { 68 | logsize := 0 69 | for i := 0; i < cfg.n; i++ { 70 | n := cfg.saved[i].RaftStateSize() 71 | if n > logsize { 72 | logsize = n 73 | } 74 | } 75 | return logsize 76 | } 77 
| 78 | // attach server i to servers listed in to 79 | // caller must hold cfg.mu 80 | func (cfg *config) connectUnlocked(i int, to []int) { 81 | // log.Printf("connect peer %d to %v\n", i, to) 82 | 83 | // outgoing socket files 84 | for j := 0; j < len(to); j++ { 85 | endname := cfg.endnames[i][to[j]] 86 | cfg.net.Enable(endname, true) 87 | } 88 | 89 | // incoming socket files 90 | for j := 0; j < len(to); j++ { 91 | endname := cfg.endnames[to[j]][i] 92 | cfg.net.Enable(endname, true) 93 | } 94 | } 95 | 96 | func (cfg *config) connect(i int, to []int) { 97 | cfg.mu.Lock() 98 | defer cfg.mu.Unlock() 99 | cfg.connectUnlocked(i, to) 100 | } 101 | 102 | // detach server i from the servers listed in from 103 | // caller must hold cfg.mu 104 | func (cfg *config) disconnectUnlocked(i int, from []int) { 105 | // log.Printf("disconnect peer %d from %v\n", i, from) 106 | 107 | // outgoing socket files 108 | for j := 0; j < len(from); j++ { 109 | if cfg.endnames[i] != nil { 110 | endname := cfg.endnames[i][from[j]] 111 | cfg.net.Enable(endname, false) 112 | } 113 | } 114 | 115 | // incoming socket files 116 | for j := 0; j < len(from); j++ { 117 | if cfg.endnames[j] != nil { 118 | endname := cfg.endnames[from[j]][i] 119 | cfg.net.Enable(endname, false) 120 | } 121 | } 122 | } 123 | 124 | func (cfg *config) disconnect(i int, from []int) { 125 | cfg.mu.Lock() 126 | defer cfg.mu.Unlock() 127 | cfg.disconnectUnlocked(i, from) 128 | } 129 | 130 | func (cfg *config) All() []int { 131 | all := make([]int, cfg.n) 132 | for i := 0; i < cfg.n; i++ { 133 | all[i] = i 134 | } 135 | return all 136 | } 137 | 138 | func (cfg *config) ConnectAll() { 139 | cfg.mu.Lock() 140 | defer cfg.mu.Unlock() 141 | for i := 0; i < cfg.n; i++ { 142 | cfg.connectUnlocked(i, cfg.All()) 143 | } 144 | } 145 | 146 | // Sets up 2 partitions with connectivity between servers in each partition. 147 | func (cfg *config) partition(p1 []int, p2 []int) { 148 | cfg.mu.Lock() 149 | defer cfg.mu.Unlock() 150 | // log.Printf("partition servers into: %v %v\n", p1, p2) 151 | for i := 0; i < len(p1); i++ { 152 | cfg.disconnectUnlocked(p1[i], p2) 153 | cfg.connectUnlocked(p1[i], p1) 154 | } 155 | for i := 0; i < len(p2); i++ { 156 | cfg.disconnectUnlocked(p2[i], p1) 157 | cfg.connectUnlocked(p2[i], p2) 158 | } 159 | } 160 | 161 | // Create a clerk with clerk specific server names. 162 | // Give it connections to all of the servers, but for 163 | // now enable only connections to servers in to[]. 164 | func (cfg *config) makeClient(to []int) *Clerk { 165 | cfg.mu.Lock() 166 | defer cfg.mu.Unlock() 167 | 168 | // a fresh set of ClientEnds. 
169 | ends := make([]*labrpc.ClientEnd, cfg.n) 170 | endnames := make([]string, cfg.n) 171 | for j := 0; j < cfg.n; j++ { 172 | endnames[j] = randstring(20) 173 | ends[j] = cfg.net.MakeEnd(endnames[j]) 174 | cfg.net.Connect(endnames[j], j) 175 | } 176 | 177 | ck := MakeClerk(random_handles(ends)) 178 | cfg.clerks[ck] = endnames 179 | cfg.nextClientId++ 180 | cfg.ConnectClientUnlocked(ck, to) 181 | return ck 182 | } 183 | 184 | func (cfg *config) deleteClient(ck *Clerk) { 185 | cfg.mu.Lock() 186 | defer cfg.mu.Unlock() 187 | 188 | v := cfg.clerks[ck] 189 | for i := 0; i < len(v); i++ { 190 | os.Remove(v[i]) 191 | } 192 | delete(cfg.clerks, ck) 193 | } 194 | 195 | // caller should hold cfg.mu 196 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 197 | // log.Printf("ConnectClient %v to %v\n", ck, to) 198 | endnames := cfg.clerks[ck] 199 | for j := 0; j < len(to); j++ { 200 | s := endnames[to[j]] 201 | cfg.net.Enable(s, true) 202 | } 203 | } 204 | 205 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 206 | cfg.mu.Lock() 207 | defer cfg.mu.Unlock() 208 | cfg.ConnectClientUnlocked(ck, to) 209 | } 210 | 211 | // caller should hold cfg.mu 212 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 213 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 214 | endnames := cfg.clerks[ck] 215 | for j := 0; j < len(from); j++ { 216 | s := endnames[from[j]] 217 | cfg.net.Enable(s, false) 218 | } 219 | } 220 | 221 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 222 | cfg.mu.Lock() 223 | defer cfg.mu.Unlock() 224 | cfg.DisconnectClientUnlocked(ck, from) 225 | } 226 | 227 | // Shutdown a server by isolating it 228 | func (cfg *config) ShutdownServer(i int) { 229 | cfg.mu.Lock() 230 | defer cfg.mu.Unlock() 231 | 232 | cfg.disconnectUnlocked(i, cfg.All()) 233 | 234 | // disable client connections to the server. 235 | // it's important to do this before creating 236 | // the new Persister in saved[i], to avoid 237 | // the possibility of the server returning a 238 | // positive reply to an Append but persisting 239 | // the result in the superseded Persister. 240 | cfg.net.DeleteServer(i) 241 | 242 | // a fresh persister, in case old instance 243 | // continues to update the Persister. 244 | // but copy old persister's content so that we always 245 | // pass Make() the last persisted state. 246 | if cfg.saved[i] != nil { 247 | cfg.saved[i] = cfg.saved[i].Copy() 248 | } 249 | 250 | kv := cfg.servers[i] 251 | if kv != nil { 252 | cfg.mu.Unlock() 253 | kv.Kill() 254 | cfg.mu.Lock() 255 | cfg.servers[i] = nil 256 | } 257 | } 258 | 259 | // If restart servers, first call ShutdownServer 260 | func (cfg *config) StartServer(i int) { 261 | cfg.mu.Lock() 262 | 263 | // a fresh set of outgoing ClientEnd names. 264 | cfg.endnames[i] = make([]string, cfg.n) 265 | for j := 0; j < cfg.n; j++ { 266 | cfg.endnames[i][j] = randstring(20) 267 | } 268 | 269 | // a fresh set of ClientEnds. 270 | ends := make([]*labrpc.ClientEnd, cfg.n) 271 | for j := 0; j < cfg.n; j++ { 272 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 273 | cfg.net.Connect(cfg.endnames[i][j], j) 274 | } 275 | 276 | // a fresh persister, so old instance doesn't overwrite 277 | // new instance's persisted state. 278 | // give the fresh persister a copy of the old persister's 279 | // state, so that the spec is that we pass StartKVServer() 280 | // the last persisted state. 
281 | if cfg.saved[i] != nil { 282 | cfg.saved[i] = cfg.saved[i].Copy() 283 | } else { 284 | cfg.saved[i] = raft.MakePersister() 285 | } 286 | 287 | cfg.mu.Unlock() 288 | 289 | cfg.servers[i] = StartServer(ends, i, cfg.saved[i]) 290 | 291 | kvsvc := labrpc.MakeService(cfg.servers[i]) 292 | rfsvc := labrpc.MakeService(cfg.servers[i].rf) 293 | srv := labrpc.MakeServer() 294 | srv.AddService(kvsvc) 295 | srv.AddService(rfsvc) 296 | cfg.net.AddServer(i, srv) 297 | } 298 | 299 | func (cfg *config) Leader() (bool, int) { 300 | cfg.mu.Lock() 301 | defer cfg.mu.Unlock() 302 | 303 | for i := 0; i < cfg.n; i++ { 304 | _, is_leader := cfg.servers[i].rf.GetState() 305 | if is_leader { 306 | return true, i 307 | } 308 | } 309 | return false, 0 310 | } 311 | 312 | // Partition servers into 2 groups and put current leader in minority 313 | func (cfg *config) make_partition() ([]int, []int) { 314 | _, l := cfg.Leader() 315 | p1 := make([]int, cfg.n/2+1) 316 | p2 := make([]int, cfg.n/2) 317 | j := 0 318 | for i := 0; i < cfg.n; i++ { 319 | if i != l { 320 | if j < len(p1) { 321 | p1[j] = i 322 | } else { 323 | p2[j-len(p1)] = i 324 | } 325 | j++ 326 | } 327 | } 328 | p2[len(p2)-1] = l 329 | return p1, p2 330 | } 331 | 332 | func make_config(t *testing.T, n int, unreliable bool) *config { 333 | runtime.GOMAXPROCS(4) 334 | cfg := &config{} 335 | cfg.t = t 336 | cfg.net = labrpc.MakeNetwork() 337 | cfg.n = n 338 | cfg.servers = make([]*ShardMaster, cfg.n) 339 | cfg.saved = make([]*raft.Persister, cfg.n) 340 | cfg.endnames = make([][]string, cfg.n) 341 | cfg.clerks = make(map[*Clerk][]string) 342 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 343 | cfg.start = time.Now() 344 | 345 | // create a full set of KV servers. 346 | for i := 0; i < cfg.n; i++ { 347 | cfg.StartServer(i) 348 | } 349 | 350 | cfg.ConnectAll() 351 | 352 | cfg.net.Reliable(!unreliable) 353 | 354 | return cfg 355 | } 356 | -------------------------------------------------------------------------------- /src/shardmaster/server.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import "raft" 4 | import "labrpc" 5 | import "sync" 6 | import "labgob" 7 | 8 | type ShardMaster struct { 9 | mu sync.Mutex 10 | me int 11 | rf *raft.Raft 12 | applyCh chan raft.ApplyMsg 13 | 14 | // Your data here. 15 | 16 | configs []Config // indexed by config num 17 | } 18 | 19 | type Op struct { 20 | // Your data here. 21 | } 22 | 23 | func (sm *ShardMaster) Join(args *JoinArgs, reply *JoinReply) { 24 | // Your code here. 25 | } 26 | 27 | func (sm *ShardMaster) Leave(args *LeaveArgs, reply *LeaveReply) { 28 | // Your code here. 29 | } 30 | 31 | func (sm *ShardMaster) Move(args *MoveArgs, reply *MoveReply) { 32 | // Your code here. 33 | } 34 | 35 | func (sm *ShardMaster) Query(args *QueryArgs, reply *QueryReply) { 36 | // Your code here. 37 | } 38 | 39 | // 40 | // the tester calls Kill() when a ShardMaster instance won't 41 | // be needed again. you are not required to do anything 42 | // in Kill(), but it might be convenient to (for example) 43 | // turn off debug output from this instance. 44 | // 45 | func (sm *ShardMaster) Kill() { 46 | sm.rf.Kill() 47 | // Your code here, if desired. 
48 | } 49 | 50 | // needed by shardkv tester 51 | func (sm *ShardMaster) Raft() *raft.Raft { 52 | return sm.rf 53 | } 54 | 55 | // 56 | // servers[] contains the ports of the set of 57 | // servers that will cooperate via Paxos to 58 | // form the fault-tolerant shardmaster service. 59 | // me is the index of the current server in servers[]. 60 | // 61 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister) *ShardMaster { 62 | sm := new(ShardMaster) 63 | sm.me = me 64 | 65 | sm.configs = make([]Config, 1) 66 | sm.configs[0].Groups = map[int][]string{} 67 | 68 | labgob.Register(Op{}) 69 | sm.applyCh = make(chan raft.ApplyMsg) 70 | sm.rf = raft.Make(servers, me, persister, sm.applyCh) 71 | 72 | // Your code here. 73 | 74 | return sm 75 | } 76 | -------------------------------------------------------------------------------- /src/shardmaster/test_test.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | ) 7 | 8 | // import "time" 9 | import "fmt" 10 | 11 | func check(t *testing.T, groups []int, ck *Clerk) { 12 | c := ck.Query(-1) 13 | if len(c.Groups) != len(groups) { 14 | t.Fatalf("wanted %v groups, got %v", len(groups), len(c.Groups)) 15 | } 16 | 17 | // are the groups as expected? 18 | for _, g := range groups { 19 | _, ok := c.Groups[g] 20 | if ok != true { 21 | t.Fatalf("missing group %v", g) 22 | } 23 | } 24 | 25 | // any un-allocated shards? 26 | if len(groups) > 0 { 27 | for s, g := range c.Shards { 28 | _, ok := c.Groups[g] 29 | if ok == false { 30 | t.Fatalf("shard %v -> invalid group %v", s, g) 31 | } 32 | } 33 | } 34 | 35 | // more or less balanced sharding? 36 | counts := map[int]int{} 37 | for _, g := range c.Shards { 38 | counts[g] += 1 39 | } 40 | min := 257 41 | max := 0 42 | for g, _ := range c.Groups { 43 | if counts[g] > max { 44 | max = counts[g] 45 | } 46 | if counts[g] < min { 47 | min = counts[g] 48 | } 49 | } 50 | if max > min+1 { 51 | t.Fatalf("max %v too much larger than min %v", max, min) 52 | } 53 | } 54 | 55 | func check_same_config(t *testing.T, c1 Config, c2 Config) { 56 | if c1.Num != c2.Num { 57 | t.Fatalf("Num wrong") 58 | } 59 | if c1.Shards != c2.Shards { 60 | t.Fatalf("Shards wrong") 61 | } 62 | if len(c1.Groups) != len(c2.Groups) { 63 | t.Fatalf("number of Groups is wrong") 64 | } 65 | for gid, sa := range c1.Groups { 66 | sa1, ok := c2.Groups[gid] 67 | if ok == false || len(sa1) != len(sa) { 68 | t.Fatalf("len(Groups) wrong") 69 | } 70 | if ok && len(sa1) == len(sa) { 71 | for j := 0; j < len(sa); j++ { 72 | if sa[j] != sa1[j] { 73 | t.Fatalf("Groups wrong") 74 | } 75 | } 76 | } 77 | } 78 | } 79 | 80 | func TestBasic(t *testing.T) { 81 | const nservers = 3 82 | cfg := make_config(t, nservers, false) 83 | defer cfg.cleanup() 84 | 85 | ck := cfg.makeClient(cfg.All()) 86 | 87 | fmt.Printf("Test: Basic leave/join ...\n") 88 | 89 | cfa := make([]Config, 6) 90 | cfa[0] = ck.Query(-1) 91 | 92 | check(t, []int{}, ck) 93 | 94 | var gid1 int = 1 95 | ck.Join(map[int][]string{gid1: []string{"x", "y", "z"}}) 96 | check(t, []int{gid1}, ck) 97 | cfa[1] = ck.Query(-1) 98 | 99 | var gid2 int = 2 100 | ck.Join(map[int][]string{gid2: []string{"a", "b", "c"}}) 101 | check(t, []int{gid1, gid2}, ck) 102 | cfa[2] = ck.Query(-1) 103 | 104 | cfx := ck.Query(-1) 105 | sa1 := cfx.Groups[gid1] 106 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 107 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 108 | } 109 | sa2 := 
cfx.Groups[gid2] 110 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 111 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 112 | } 113 | 114 | ck.Leave([]int{gid1}) 115 | check(t, []int{gid2}, ck) 116 | cfa[4] = ck.Query(-1) 117 | 118 | ck.Leave([]int{gid2}) 119 | cfa[5] = ck.Query(-1) 120 | 121 | fmt.Printf(" ... Passed\n") 122 | 123 | fmt.Printf("Test: Historical queries ...\n") 124 | 125 | for s := 0; s < nservers; s++ { 126 | cfg.ShutdownServer(s) 127 | for i := 0; i < len(cfa); i++ { 128 | c := ck.Query(cfa[i].Num) 129 | check_same_config(t, c, cfa[i]) 130 | } 131 | cfg.StartServer(s) 132 | cfg.ConnectAll() 133 | } 134 | 135 | fmt.Printf(" ... Passed\n") 136 | 137 | fmt.Printf("Test: Move ...\n") 138 | { 139 | var gid3 int = 503 140 | ck.Join(map[int][]string{gid3: []string{"3a", "3b", "3c"}}) 141 | var gid4 int = 504 142 | ck.Join(map[int][]string{gid4: []string{"4a", "4b", "4c"}}) 143 | for i := 0; i < NShards; i++ { 144 | cf := ck.Query(-1) 145 | if i < NShards/2 { 146 | ck.Move(i, gid3) 147 | if cf.Shards[i] != gid3 { 148 | cf1 := ck.Query(-1) 149 | if cf1.Num <= cf.Num { 150 | t.Fatalf("Move should increase Config.Num") 151 | } 152 | } 153 | } else { 154 | ck.Move(i, gid4) 155 | if cf.Shards[i] != gid4 { 156 | cf1 := ck.Query(-1) 157 | if cf1.Num <= cf.Num { 158 | t.Fatalf("Move should increase Config.Num") 159 | } 160 | } 161 | } 162 | } 163 | cf2 := ck.Query(-1) 164 | for i := 0; i < NShards; i++ { 165 | if i < NShards/2 { 166 | if cf2.Shards[i] != gid3 { 167 | t.Fatalf("expected shard %v on gid %v actually %v", 168 | i, gid3, cf2.Shards[i]) 169 | } 170 | } else { 171 | if cf2.Shards[i] != gid4 { 172 | t.Fatalf("expected shard %v on gid %v actually %v", 173 | i, gid4, cf2.Shards[i]) 174 | } 175 | } 176 | } 177 | ck.Leave([]int{gid3}) 178 | ck.Leave([]int{gid4}) 179 | } 180 | fmt.Printf(" ... Passed\n") 181 | 182 | fmt.Printf("Test: Concurrent leave/join ...\n") 183 | 184 | const npara = 10 185 | var cka [npara]*Clerk 186 | for i := 0; i < len(cka); i++ { 187 | cka[i] = cfg.makeClient(cfg.All()) 188 | } 189 | gids := make([]int, npara) 190 | ch := make(chan bool) 191 | for xi := 0; xi < npara; xi++ { 192 | gids[xi] = int((xi * 10) + 100) 193 | go func(i int) { 194 | defer func() { ch <- true }() 195 | var gid int = gids[i] 196 | var sid1 = fmt.Sprintf("s%da", gid) 197 | var sid2 = fmt.Sprintf("s%db", gid) 198 | cka[i].Join(map[int][]string{gid + 1000: []string{sid1}}) 199 | cka[i].Join(map[int][]string{gid: []string{sid2}}) 200 | cka[i].Leave([]int{gid + 1000}) 201 | }(xi) 202 | } 203 | for i := 0; i < npara; i++ { 204 | <-ch 205 | } 206 | check(t, gids, ck) 207 | 208 | fmt.Printf(" ... Passed\n") 209 | 210 | fmt.Printf("Test: Minimal transfers after joins ...\n") 211 | 212 | c1 := ck.Query(-1) 213 | for i := 0; i < 5; i++ { 214 | var gid = int(npara + 1 + i) 215 | ck.Join(map[int][]string{gid: []string{ 216 | fmt.Sprintf("%da", gid), 217 | fmt.Sprintf("%db", gid), 218 | fmt.Sprintf("%db", gid)}}) 219 | } 220 | c2 := ck.Query(-1) 221 | for i := int(1); i <= npara; i++ { 222 | for j := 0; j < len(c1.Shards); j++ { 223 | if c2.Shards[j] == i { 224 | if c1.Shards[j] != i { 225 | t.Fatalf("non-minimal transfer after Join()s") 226 | } 227 | } 228 | } 229 | } 230 | 231 | fmt.Printf(" ... 
Passed\n") 232 | 233 | fmt.Printf("Test: Minimal transfers after leaves ...\n") 234 | 235 | for i := 0; i < 5; i++ { 236 | ck.Leave([]int{int(npara + 1 + i)}) 237 | } 238 | c3 := ck.Query(-1) 239 | for i := int(1); i <= npara; i++ { 240 | for j := 0; j < len(c1.Shards); j++ { 241 | if c2.Shards[j] == i { 242 | if c3.Shards[j] != i { 243 | t.Fatalf("non-minimal transfer after Leave()s") 244 | } 245 | } 246 | } 247 | } 248 | 249 | fmt.Printf(" ... Passed\n") 250 | } 251 | 252 | func TestMulti(t *testing.T) { 253 | const nservers = 3 254 | cfg := make_config(t, nservers, false) 255 | defer cfg.cleanup() 256 | 257 | ck := cfg.makeClient(cfg.All()) 258 | 259 | fmt.Printf("Test: Multi-group join/leave ...\n") 260 | 261 | cfa := make([]Config, 6) 262 | cfa[0] = ck.Query(-1) 263 | 264 | check(t, []int{}, ck) 265 | 266 | var gid1 int = 1 267 | var gid2 int = 2 268 | ck.Join(map[int][]string{ 269 | gid1: []string{"x", "y", "z"}, 270 | gid2: []string{"a", "b", "c"}, 271 | }) 272 | check(t, []int{gid1, gid2}, ck) 273 | cfa[1] = ck.Query(-1) 274 | 275 | var gid3 int = 3 276 | ck.Join(map[int][]string{gid3: []string{"j", "k", "l"}}) 277 | check(t, []int{gid1, gid2, gid3}, ck) 278 | cfa[2] = ck.Query(-1) 279 | 280 | cfx := ck.Query(-1) 281 | sa1 := cfx.Groups[gid1] 282 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 283 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 284 | } 285 | sa2 := cfx.Groups[gid2] 286 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 287 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 288 | } 289 | sa3 := cfx.Groups[gid3] 290 | if len(sa3) != 3 || sa3[0] != "j" || sa3[1] != "k" || sa3[2] != "l" { 291 | t.Fatalf("wrong servers for gid %v: %v\n", gid3, sa3) 292 | } 293 | 294 | ck.Leave([]int{gid1, gid3}) 295 | check(t, []int{gid2}, ck) 296 | cfa[3] = ck.Query(-1) 297 | 298 | cfx = ck.Query(-1) 299 | sa2 = cfx.Groups[gid2] 300 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 301 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 302 | } 303 | 304 | ck.Leave([]int{gid2}) 305 | 306 | fmt.Printf(" ... Passed\n") 307 | 308 | fmt.Printf("Test: Concurrent multi leave/join ...\n") 309 | 310 | const npara = 10 311 | var cka [npara]*Clerk 312 | for i := 0; i < len(cka); i++ { 313 | cka[i] = cfg.makeClient(cfg.All()) 314 | } 315 | gids := make([]int, npara) 316 | var wg sync.WaitGroup 317 | for xi := 0; xi < npara; xi++ { 318 | wg.Add(1) 319 | gids[xi] = int(xi + 1000) 320 | go func(i int) { 321 | defer wg.Done() 322 | var gid int = gids[i] 323 | cka[i].Join(map[int][]string{ 324 | gid: []string{ 325 | fmt.Sprintf("%da", gid), 326 | fmt.Sprintf("%db", gid), 327 | fmt.Sprintf("%dc", gid)}, 328 | gid + 1000: []string{fmt.Sprintf("%da", gid+1000)}, 329 | gid + 2000: []string{fmt.Sprintf("%da", gid+2000)}, 330 | }) 331 | cka[i].Leave([]int{gid + 1000, gid + 2000}) 332 | }(xi) 333 | } 334 | wg.Wait() 335 | check(t, gids, ck) 336 | 337 | fmt.Printf(" ... 
Passed\n") 338 | 339 | fmt.Printf("Test: Minimal transfers after multijoins ...\n") 340 | 341 | c1 := ck.Query(-1) 342 | m := make(map[int][]string) 343 | for i := 0; i < 5; i++ { 344 | var gid = npara + 1 + i 345 | m[gid] = []string{fmt.Sprintf("%da", gid), fmt.Sprintf("%db", gid)} 346 | } 347 | ck.Join(m) 348 | c2 := ck.Query(-1) 349 | for i := int(1); i <= npara; i++ { 350 | for j := 0; j < len(c1.Shards); j++ { 351 | if c2.Shards[j] == i { 352 | if c1.Shards[j] != i { 353 | t.Fatalf("non-minimal transfer after Join()s") 354 | } 355 | } 356 | } 357 | } 358 | 359 | fmt.Printf(" ... Passed\n") 360 | 361 | fmt.Printf("Test: Minimal transfers after multileaves ...\n") 362 | 363 | var l []int 364 | for i := 0; i < 5; i++ { 365 | l = append(l, npara+1+i) 366 | } 367 | ck.Leave(l) 368 | c3 := ck.Query(-1) 369 | for i := int(1); i <= npara; i++ { 370 | for j := 0; j < len(c1.Shards); j++ { 371 | if c2.Shards[j] == i { 372 | if c3.Shards[j] != i { 373 | t.Fatalf("non-minimal transfer after Leave()s") 374 | } 375 | } 376 | } 377 | } 378 | 379 | fmt.Printf(" ... Passed\n") 380 | } 381 | --------------------------------------------------------------------------------