├── .gitignore
├── MAPREDUCE.md
├── Makefile
├── RAFT.md
├── README.md
├── resource
│   └── mr.png
└── src
    ├── .gitignore
    ├── kvraft
    │   ├── client.go
    │   ├── common.go
    │   ├── config.go
    │   ├── server.go
    │   └── test_test.go
    ├── labgob
    │   ├── labgob.go
    │   └── test_test.go
    ├── labrpc
    │   ├── labrpc.go
    │   └── test_test.go
    ├── linearizability
    │   ├── bitset.go
    │   ├── linearizability.go
    │   ├── model.go
    │   └── models.go
    ├── main
    │   ├── diskvd.go
    │   ├── lockc.go
    │   ├── lockd.go
    │   ├── mrmaster.go
    │   ├── mrsequential.go
    │   ├── mrworker.go
    │   ├── pbc.go
    │   ├── pbd.go
    │   ├── pg-being_ernest.txt
    │   ├── pg-dorian_gray.txt
    │   ├── pg-frankenstein.txt
    │   ├── pg-grimm.txt
    │   ├── pg-huckleberry_finn.txt
    │   ├── pg-metamorphosis.txt
    │   ├── pg-sherlock_holmes.txt
    │   ├── pg-tom_sawyer.txt
    │   ├── test-mr.sh
    │   └── viewd.go
    ├── mr
    │   ├── master.go
    │   ├── rpc.go
    │   └── worker.go
    ├── mrapps
    │   ├── crash.go
    │   ├── indexer.go
    │   ├── mtiming.go
    │   ├── nocrash.go
    │   ├── rtiming.go
    │   └── wc.go
    ├── mroriginal
    │   ├── master.go
    │   ├── rpc.go
    │   └── worker.go
    ├── raft
    │   ├── config.go
    │   ├── persister.go
    │   ├── raft.go
    │   ├── test_test.go
    │   └── util.go
    ├── shardkv
    │   ├── client.go
    │   ├── common.go
    │   ├── config.go
    │   ├── server.go
    │   └── test_test.go
    └── shardmaster
        ├── client.go
        ├── common.go
        ├── config.go
        ├── server.go
        └── test_test.go

/.gitignore:
--------------------------------------------------------------------------------
1 | pkg/
2 | api.key
3 | *-handin.tar.gz
4 | .idea
5 | src/main/test-mr.sh
--------------------------------------------------------------------------------
/MAPREDUCE.md:
--------------------------------------------------------------------------------
1 | # Lab 1: Implement a MapReduce Framework
2 | ## 0. Run a sequential **mapreduce** example to get a feel for the program logic
3 | ```bash
4 | cd src/main
5 | # build the user-defined map/reduce functions as a plugin (dynamically loaded library)
6 | go build -buildmode=plugin ../mrapps/wc.go
7 | rm mr-out*
8 | go run mrsequential.go wc.so pg*.txt
9 | more mr-out-0
10 | ```
11 | ## 1. How to start a distributed **mapreduce**?
12 | ### Start a master
13 | ```bash
14 | cd src/main
15 | go build -buildmode=plugin ../mrapps/wc.go
16 | rm mr-out*
17 | go run mrmaster.go pg-*.txt
18 | ```
19 | ### Start several workers
20 | Open several terminals, change into the project's `main` directory, and run `go run mrworker.go wc.so` in each.
21 | 
22 | ## 2. Complete the lab
23 | ### Coding
24 | 1. Write your implementation in the `src/mr` directory.
25 | 2. `mrmaster.go` calls the code you write in `src/mr/master.go`.
26 | 3. `mrworker.go` calls the code in `src/mr/worker.go`.
27 | 4. The RPC that connects the two is yours to implement in `src/mr/rpc.go` (a hedged sketch of one possible definition follows the README section below).
28 | 5. I have already completed the lab code; the original lab skeleton is kept in `src/mroriginal` and is left for you to complete independently.
29 | 6. Just overwrite the mr folder with it, `mv src/mroriginal src/mr`, or back mine up first as a reference 😁
30 | 
31 | **ps: the files under `src/mrapps` are the MapReduce (MR) application functions, i.e. the map/reduce plugins.**
32 | 
33 | ### Debugging
34 | Comparing against the output of `cat mr-out-* | sort` lets you check whether the MR framework you wrote runs correctly.
35 | 
36 | ### Testing
37 | Run `sh test-mr.sh` in the `src/main` directory. Because the master process never exits by default, remember to run `killall mr*` after the script finishes to release the resources.
38 | 
39 | ## More on Lab 1
40 | [Click here](http://nil.csail.mit.edu/6.824/2020/labs/lab-mr.html)
41 | 
42 | ## Passing screenshot
43 | ![mr](resource/mr.png)
44 | 
45 | ## Takeaways
46 | 1. All tests pass, but many parts still feel unfinished.
47 | 2. I did not think resource management through at design time, which made it hard to manage later.
48 | 3. One engineering principle learned: settling the essential problems at design time is the key to productive output.
49 | 
50 | ## TODO
51 | - [ ] Release resources properly after a job completes
52 | 
53 | - [ ] Handle abnormal communication states between worker and master
54 | 
55 | - [ ] Shut the whole cluster down gracefully
56 | - [ ] Release the resources a job depends on when the cluster is shut down
57 | - [ ] Reject worker registrations and task requests once the job is finished
58 | 
59 | - [ ] Grow into a production-grade embeddable MapReduce framework
60 | - [ ] Let the master run multiple different jobs and their workers at the same time
61 | - [ ] Support more robust data transfer over an asynchronous network
62 | 
63 | - [ ] Guarantee that Map task commits are atomic (when the map intermediate files are produced, neither the commit message nor the file creation is currently idempotent)
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # This is the Makefile helping you submit the labs.
2 | # Just create 6.824/api.key with your API key in it,
3 | # and submit your lab with the following command:
4 | #     $ make [lab1|lab2a|lab2b|lab2c|lab3a|lab3b|lab4a|lab4b]
5 | 
6 | LABS=" lab1 lab2a lab2b lab2c lab3a lab3b lab4a lab4b "
7 | 
8 | %:
9 | 	@echo "Preparing $@-handin.tar.gz"
10 | 	@echo "Checking for committed temporary files..."
11 | 	@if git ls-files | grep -E 'mr-tmp|mrinput' > /dev/null; then \
12 | 		echo "" ; \
13 | 		echo "OBS! You have committed some large temporary files:" ; \
14 | 		echo "" ; \
15 | 		git ls-files | grep -E 'mr-tmp|mrinput' | sed 's/^/\t/' ; \
16 | 		echo "" ; \
17 | 		echo "Follow the instructions at http://stackoverflow.com/a/308684/472927" ; \
18 | 		echo "to remove them, and then run make again." ; \
19 | 		echo "" ; \
20 | 		exit 1 ; \
21 | 	fi
22 | 	@if echo $(LABS) | grep -q " $@ " ; then \
23 | 		echo "Tarring up your submission..." ; \
24 | 		tar cvzf $@-handin.tar.gz \
25 | 			"--exclude=src/main/pg-*.txt" \
26 | 			"--exclude=src/main/diskvd" \
27 | 			"--exclude=src/mapreduce/824-mrinput-*.txt" \
28 | 			"--exclude=src/main/mr-*" \
29 | 			"--exclude=mrtmp.*" \
30 | 			"--exclude=src/main/diff.out" \
31 | 			Makefile src; \
32 | 		if ! test -e api.key ; then \
33 | 			echo "Missing $(PWD)/api.key. Please create the file with your key in it or submit the $@-handin.tar.gz via the web interface."; \
34 | 		else \
35 | 			echo "Are you sure you want to submit $@? Enter 'yes' to continue:"; \
36 | 			read line; \
37 | 			if test "$$line" != "yes" ; then echo "Giving up submission"; exit; fi; \
38 | 			if test `stat -c "%s" "$@-handin.tar.gz" 2>/dev/null || stat -f "%z" "$@-handin.tar.gz"` -ge 20971520 ; then echo "File exceeds 20MB."; exit; fi; \
39 | 			mv api.key api.key.fix ; \
40 | 			cat api.key.fix | tr -d '\n' > api.key ; \
41 | 			rm api.key.fix ; \
42 | 			curl -F file=@$@-handin.tar.gz -F "key= /dev/null || { \
43 | 				echo ; \
44 | 				echo "Submit seems to have failed."; \
45 | 				echo "Please upload the tarball manually on the submission website."; } \
46 | 		fi; \
47 | 	else \
48 | 		echo "Bad target $@. Usage: make [$(LABS)]"; \
49 | 	fi
50 | 
--------------------------------------------------------------------------------
/RAFT.md:
--------------------------------------------------------------------------------
1 | ## Lab 2A: Implement Raft election and heartbeats
2 | Took 13 hours in total (one hour of lab time per day; the scattered focus really did slow things down).
3 | Most of the time was lost to `not analysing the problems clearly enough`. The main issues were:
4 | 1. The ratio between the heartbeat interval and the election timeout was poorly tuned (a hedged timeout sketch follows the README section below).
5 | 2. Cluster role transitions were not handled well; after some key actions the role was not set correctly.
6 | 3. I was overconfident. Comments should be written before code, and code should only be written once the whole logic reads smoothly; that is what keeps bugs down and efficiency up. From now on I will write the complete logic out as comments in one sitting before developing.
7 | 
8 | ### Things that still feel unfinished
9 | TODO:
10 | - [ ] In the election/heartbeat paths, only atomic locking of Raft's state protects concurrent updates; there is no transactional guarantee across multiple pieces of state, which is a latent risk (and probably why a test occasionally fails).
11 | - [ ] Tests still fail once in a while; for now I put it down to timing variability and will fix it when a later lab surfaces the problem.
12 | 
13 | ### Final summary
14 | 1. The initial design should consider only what is necessary; reducing complexity is what lets you focus on the important things.
15 | 2. The way to keep the logic coherent is to write the comments for the key logic first, in one sitting, reasoning in pseudocode away from the code; most problems are logic problems.
16 | 3. The problems are usually simple; I just had not followed a disciplined way of thinking about them.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | A record of completing labs 1-4 of the MIT 6.824 course (Spring 2020). This project only moves some of the lab packages around so that cloning it under `$GOPATH/src` runs the labs correctly.
3 | # Value
4 | This repository aims to help students taking 6.824 set up the lab environment quickly, and it provides my completed solutions as a reference. PRs that keep the repo current with different offerings of 6.824 are welcome, to make it easier for Chinese students to learn distributed systems.
5 | # Disclaimer
6 | For personal study only. You are strongly encouraged to complete the labs yourself so you learn more; the answers are meant to help you form your own approach, not to make copying possible!!!
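The following sketch relates to step 4 of the Coding list in MAPREDUCE.md above: the worker/master RPC that `src/mr/rpc.go` leaves for you to define. It is a minimal illustration under assumed names; `TaskKind`, `TaskRequest`, `TaskReply`, and the field names are hypothetical and are not taken from this repository's solution.

```go
package mr

// One possible shape for the worker<->master protocol: a worker asks for work
// and the master replies with a map task, a reduce task, or an instruction to
// wait or exit. All identifiers here are illustrative assumptions.

type TaskKind int

const (
	MapTask TaskKind = iota
	ReduceTask
	WaitTask // nothing to hand out right now; sleep and ask again
	ExitTask // the job is finished; the worker may terminate
)

type TaskRequest struct {
	WorkerId int // identifies the worker asking for a task
}

type TaskReply struct {
	Kind    TaskKind
	TaskId  int
	File    string // input file for a map task
	NReduce int    // number of reduce partitions, used to name mr-X-Y files
	NMap    int    // number of map tasks, so a reducer knows which files to read
}
```

A worker would then loop on something like `call("Master.AssignTask", &TaskRequest{...}, &reply)` (the handler name is again an assumption) until it receives `ExitTask`.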
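The next sketch relates to point 1 of Lab 2A in RAFT.md above, the heartbeat/election-timeout ratio. It only illustrates the usual rule of thumb that the randomized election timeout should span several heartbeat intervals; the constants and the helper name are assumptions, not values read out of this repository's `raft.go`.

```go
package raft

import (
	"math/rand"
	"time"
)

// Illustrative values only: heartbeats must arrive well inside the election
// timeout, and the timeout is randomized so that followers do not all start
// elections at the same moment.
const heartbeatInterval = 100 * time.Millisecond

// randomElectionTimeout draws a fresh timeout in the 300-600ms range, i.e.
// several heartbeat intervals, each time a follower resets its election timer.
func randomElectionTimeout() time.Duration {
	return 300*time.Millisecond + time.Duration(rand.Int63n(300))*time.Millisecond
}
```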
7 | -------------------------------------------------------------------------------- /resource/mr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logikoisto/Mit6824/820d278e1ff0dd86f74bd3929550c7d23291f6b2/resource/mr.png -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.*/ 2 | mrtmp.* 3 | 824-mrinput-*.txt 4 | /main/diff.out 5 | /mapreduce/x.txt 6 | /pbservice/x.txt 7 | /kvpaxos/x.txt 8 | /main/wc.so 9 | /main/mr-out-0 10 | -------------------------------------------------------------------------------- /src/kvraft/client.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "labrpc" 4 | import "crypto/rand" 5 | import "math/big" 6 | 7 | type Clerk struct { 8 | servers []*labrpc.ClientEnd 9 | // You will have to modify this struct. 10 | } 11 | 12 | func nrand() int64 { 13 | max := big.NewInt(int64(1) << 62) 14 | bigx, _ := rand.Int(rand.Reader, max) 15 | x := bigx.Int64() 16 | return x 17 | } 18 | 19 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 20 | ck := new(Clerk) 21 | ck.servers = servers 22 | // You'll have to add code here. 23 | return ck 24 | } 25 | 26 | // 27 | // fetch the current value for a key. 28 | // returns "" if the key does not exist. 29 | // keeps trying forever in the face of all other errors. 30 | // 31 | // you can send an RPC with code like this: 32 | // ok := ck.servers[i].Call("KVServer.Get", &args, &reply) 33 | // 34 | // the types of args and reply (including whether they are pointers) 35 | // must match the declared types of the RPC handler function's 36 | // arguments. and reply must be passed as a pointer. 37 | // 38 | func (ck *Clerk) Get(key string) string { 39 | 40 | // You will have to modify this function. 41 | return "" 42 | } 43 | 44 | // 45 | // shared by Put and Append. 46 | // 47 | // you can send an RPC with code like this: 48 | // ok := ck.servers[i].Call("KVServer.PutAppend", &args, &reply) 49 | // 50 | // the types of args and reply (including whether they are pointers) 51 | // must match the declared types of the RPC handler function's 52 | // arguments. and reply must be passed as a pointer. 53 | // 54 | func (ck *Clerk) PutAppend(key string, value string, op string) { 55 | // You will have to modify this function. 56 | } 57 | 58 | func (ck *Clerk) Put(key string, value string) { 59 | ck.PutAppend(key, value, "Put") 60 | } 61 | func (ck *Clerk) Append(key string, value string) { 62 | ck.PutAppend(key, value, "Append") 63 | } 64 | -------------------------------------------------------------------------------- /src/kvraft/common.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | const ( 4 | OK = "OK" 5 | ErrNoKey = "ErrNoKey" 6 | ErrWrongLeader = "ErrWrongLeader" 7 | ) 8 | 9 | type Err string 10 | 11 | // Put or Append 12 | type PutAppendArgs struct { 13 | Key string 14 | Value string 15 | Op string // "Put" or "Append" 16 | // You'll have to add definitions here. 17 | // Field names must start with capital letters, 18 | // otherwise RPC will break. 19 | } 20 | 21 | type PutAppendReply struct { 22 | Err Err 23 | } 24 | 25 | type GetArgs struct { 26 | Key string 27 | // You'll have to add definitions here. 
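// (suggestion, not part of the original skeleton) many solutions add something
// like a client id and a per-client sequence number here, mirroring whatever
// is added to PutAppendArgs above, so the server can detect duplicate
// requests; the exact fields and names are yours to choose.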
28 | } 29 | 30 | type GetReply struct { 31 | Err Err 32 | Value string 33 | } 34 | -------------------------------------------------------------------------------- /src/kvraft/config.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "labrpc" 4 | import "testing" 5 | import "os" 6 | 7 | // import "log" 8 | import crand "crypto/rand" 9 | import "math/big" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "raft" 15 | import "fmt" 16 | import "time" 17 | import "sync/atomic" 18 | 19 | func randstring(n int) string { 20 | b := make([]byte, 2*n) 21 | crand.Read(b) 22 | s := base64.URLEncoding.EncodeToString(b) 23 | return s[0:n] 24 | } 25 | 26 | func makeSeed() int64 { 27 | max := big.NewInt(int64(1) << 62) 28 | bigx, _ := crand.Int(crand.Reader, max) 29 | x := bigx.Int64() 30 | return x 31 | } 32 | 33 | // Randomize server handles 34 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 35 | sa := make([]*labrpc.ClientEnd, len(kvh)) 36 | copy(sa, kvh) 37 | for i := range sa { 38 | j := rand.Intn(i + 1) 39 | sa[i], sa[j] = sa[j], sa[i] 40 | } 41 | return sa 42 | } 43 | 44 | type config struct { 45 | mu sync.Mutex 46 | t *testing.T 47 | net *labrpc.Network 48 | n int 49 | kvservers []*KVServer 50 | saved []*raft.Persister 51 | endnames [][]string // names of each server's sending ClientEnds 52 | clerks map[*Clerk][]string 53 | nextClientId int 54 | maxraftstate int 55 | start time.Time // time at which make_config() was called 56 | // begin()/end() statistics 57 | t0 time.Time // time at which test_test.go called cfg.begin() 58 | rpcs0 int // rpcTotal() at start of test 59 | ops int32 // number of clerk get/put/append method calls 60 | } 61 | 62 | func (cfg *config) checkTimeout() { 63 | // enforce a two minute real-time limit on each test 64 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 65 | cfg.t.Fatal("test took longer than 120 seconds") 66 | } 67 | } 68 | 69 | func (cfg *config) cleanup() { 70 | cfg.mu.Lock() 71 | defer cfg.mu.Unlock() 72 | for i := 0; i < len(cfg.kvservers); i++ { 73 | if cfg.kvservers[i] != nil { 74 | cfg.kvservers[i].Kill() 75 | } 76 | } 77 | cfg.net.Cleanup() 78 | cfg.checkTimeout() 79 | } 80 | 81 | // Maximum log size across all servers 82 | func (cfg *config) LogSize() int { 83 | logsize := 0 84 | for i := 0; i < cfg.n; i++ { 85 | n := cfg.saved[i].RaftStateSize() 86 | if n > logsize { 87 | logsize = n 88 | } 89 | } 90 | return logsize 91 | } 92 | 93 | // Maximum snapshot size across all servers 94 | func (cfg *config) SnapshotSize() int { 95 | snapshotsize := 0 96 | for i := 0; i < cfg.n; i++ { 97 | n := cfg.saved[i].SnapshotSize() 98 | if n > snapshotsize { 99 | snapshotsize = n 100 | } 101 | } 102 | return snapshotsize 103 | } 104 | 105 | // attach server i to servers listed in to 106 | // caller must hold cfg.mu 107 | func (cfg *config) connectUnlocked(i int, to []int) { 108 | // log.Printf("connect peer %d to %v\n", i, to) 109 | 110 | // outgoing socket files 111 | for j := 0; j < len(to); j++ { 112 | endname := cfg.endnames[i][to[j]] 113 | cfg.net.Enable(endname, true) 114 | } 115 | 116 | // incoming socket files 117 | for j := 0; j < len(to); j++ { 118 | endname := cfg.endnames[to[j]][i] 119 | cfg.net.Enable(endname, true) 120 | } 121 | } 122 | 123 | func (cfg *config) connect(i int, to []int) { 124 | cfg.mu.Lock() 125 | defer cfg.mu.Unlock() 126 | cfg.connectUnlocked(i, to) 127 | } 128 | 129 | // detach 
server i from the servers listed in from 130 | // caller must hold cfg.mu 131 | func (cfg *config) disconnectUnlocked(i int, from []int) { 132 | // log.Printf("disconnect peer %d from %v\n", i, from) 133 | 134 | // outgoing socket files 135 | for j := 0; j < len(from); j++ { 136 | if cfg.endnames[i] != nil { 137 | endname := cfg.endnames[i][from[j]] 138 | cfg.net.Enable(endname, false) 139 | } 140 | } 141 | 142 | // incoming socket files 143 | for j := 0; j < len(from); j++ { 144 | if cfg.endnames[j] != nil { 145 | endname := cfg.endnames[from[j]][i] 146 | cfg.net.Enable(endname, false) 147 | } 148 | } 149 | } 150 | 151 | func (cfg *config) disconnect(i int, from []int) { 152 | cfg.mu.Lock() 153 | defer cfg.mu.Unlock() 154 | cfg.disconnectUnlocked(i, from) 155 | } 156 | 157 | func (cfg *config) All() []int { 158 | all := make([]int, cfg.n) 159 | for i := 0; i < cfg.n; i++ { 160 | all[i] = i 161 | } 162 | return all 163 | } 164 | 165 | func (cfg *config) ConnectAll() { 166 | cfg.mu.Lock() 167 | defer cfg.mu.Unlock() 168 | for i := 0; i < cfg.n; i++ { 169 | cfg.connectUnlocked(i, cfg.All()) 170 | } 171 | } 172 | 173 | // Sets up 2 partitions with connectivity between servers in each partition. 174 | func (cfg *config) partition(p1 []int, p2 []int) { 175 | cfg.mu.Lock() 176 | defer cfg.mu.Unlock() 177 | // log.Printf("partition servers into: %v %v\n", p1, p2) 178 | for i := 0; i < len(p1); i++ { 179 | cfg.disconnectUnlocked(p1[i], p2) 180 | cfg.connectUnlocked(p1[i], p1) 181 | } 182 | for i := 0; i < len(p2); i++ { 183 | cfg.disconnectUnlocked(p2[i], p1) 184 | cfg.connectUnlocked(p2[i], p2) 185 | } 186 | } 187 | 188 | // Create a clerk with clerk specific server names. 189 | // Give it connections to all of the servers, but for 190 | // now enable only connections to servers in to[]. 191 | func (cfg *config) makeClient(to []int) *Clerk { 192 | cfg.mu.Lock() 193 | defer cfg.mu.Unlock() 194 | 195 | // a fresh set of ClientEnds. 
196 | ends := make([]*labrpc.ClientEnd, cfg.n) 197 | endnames := make([]string, cfg.n) 198 | for j := 0; j < cfg.n; j++ { 199 | endnames[j] = randstring(20) 200 | ends[j] = cfg.net.MakeEnd(endnames[j]) 201 | cfg.net.Connect(endnames[j], j) 202 | } 203 | 204 | ck := MakeClerk(random_handles(ends)) 205 | cfg.clerks[ck] = endnames 206 | cfg.nextClientId++ 207 | cfg.ConnectClientUnlocked(ck, to) 208 | return ck 209 | } 210 | 211 | func (cfg *config) deleteClient(ck *Clerk) { 212 | cfg.mu.Lock() 213 | defer cfg.mu.Unlock() 214 | 215 | v := cfg.clerks[ck] 216 | for i := 0; i < len(v); i++ { 217 | os.Remove(v[i]) 218 | } 219 | delete(cfg.clerks, ck) 220 | } 221 | 222 | // caller should hold cfg.mu 223 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 224 | // log.Printf("ConnectClient %v to %v\n", ck, to) 225 | endnames := cfg.clerks[ck] 226 | for j := 0; j < len(to); j++ { 227 | s := endnames[to[j]] 228 | cfg.net.Enable(s, true) 229 | } 230 | } 231 | 232 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 233 | cfg.mu.Lock() 234 | defer cfg.mu.Unlock() 235 | cfg.ConnectClientUnlocked(ck, to) 236 | } 237 | 238 | // caller should hold cfg.mu 239 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 240 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 241 | endnames := cfg.clerks[ck] 242 | for j := 0; j < len(from); j++ { 243 | s := endnames[from[j]] 244 | cfg.net.Enable(s, false) 245 | } 246 | } 247 | 248 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 249 | cfg.mu.Lock() 250 | defer cfg.mu.Unlock() 251 | cfg.DisconnectClientUnlocked(ck, from) 252 | } 253 | 254 | // Shutdown a server by isolating it 255 | func (cfg *config) ShutdownServer(i int) { 256 | cfg.mu.Lock() 257 | defer cfg.mu.Unlock() 258 | 259 | cfg.disconnectUnlocked(i, cfg.All()) 260 | 261 | // disable client connections to the server. 262 | // it's important to do this before creating 263 | // the new Persister in saved[i], to avoid 264 | // the possibility of the server returning a 265 | // positive reply to an Append but persisting 266 | // the result in the superseded Persister. 267 | cfg.net.DeleteServer(i) 268 | 269 | // a fresh persister, in case old instance 270 | // continues to update the Persister. 271 | // but copy old persister's content so that we always 272 | // pass Make() the last persisted state. 273 | if cfg.saved[i] != nil { 274 | cfg.saved[i] = cfg.saved[i].Copy() 275 | } 276 | 277 | kv := cfg.kvservers[i] 278 | if kv != nil { 279 | cfg.mu.Unlock() 280 | kv.Kill() 281 | cfg.mu.Lock() 282 | cfg.kvservers[i] = nil 283 | } 284 | } 285 | 286 | // If restart servers, first call ShutdownServer 287 | func (cfg *config) StartServer(i int) { 288 | cfg.mu.Lock() 289 | 290 | // a fresh set of outgoing ClientEnd names. 291 | cfg.endnames[i] = make([]string, cfg.n) 292 | for j := 0; j < cfg.n; j++ { 293 | cfg.endnames[i][j] = randstring(20) 294 | } 295 | 296 | // a fresh set of ClientEnds. 297 | ends := make([]*labrpc.ClientEnd, cfg.n) 298 | for j := 0; j < cfg.n; j++ { 299 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 300 | cfg.net.Connect(cfg.endnames[i][j], j) 301 | } 302 | 303 | // a fresh persister, so old instance doesn't overwrite 304 | // new instance's persisted state. 305 | // give the fresh persister a copy of the old persister's 306 | // state, so that the spec is that we pass StartKVServer() 307 | // the last persisted state. 
308 | if cfg.saved[i] != nil { 309 | cfg.saved[i] = cfg.saved[i].Copy() 310 | } else { 311 | cfg.saved[i] = raft.MakePersister() 312 | } 313 | cfg.mu.Unlock() 314 | 315 | cfg.kvservers[i] = StartKVServer(ends, i, cfg.saved[i], cfg.maxraftstate) 316 | 317 | kvsvc := labrpc.MakeService(cfg.kvservers[i]) 318 | rfsvc := labrpc.MakeService(cfg.kvservers[i].rf) 319 | srv := labrpc.MakeServer() 320 | srv.AddService(kvsvc) 321 | srv.AddService(rfsvc) 322 | cfg.net.AddServer(i, srv) 323 | } 324 | 325 | func (cfg *config) Leader() (bool, int) { 326 | cfg.mu.Lock() 327 | defer cfg.mu.Unlock() 328 | 329 | for i := 0; i < cfg.n; i++ { 330 | _, is_leader := cfg.kvservers[i].rf.GetState() 331 | if is_leader { 332 | return true, i 333 | } 334 | } 335 | return false, 0 336 | } 337 | 338 | // Partition servers into 2 groups and put current leader in minority 339 | func (cfg *config) make_partition() ([]int, []int) { 340 | _, l := cfg.Leader() 341 | p1 := make([]int, cfg.n/2+1) 342 | p2 := make([]int, cfg.n/2) 343 | j := 0 344 | for i := 0; i < cfg.n; i++ { 345 | if i != l { 346 | if j < len(p1) { 347 | p1[j] = i 348 | } else { 349 | p2[j-len(p1)] = i 350 | } 351 | j++ 352 | } 353 | } 354 | p2[len(p2)-1] = l 355 | return p1, p2 356 | } 357 | 358 | var ncpu_once sync.Once 359 | 360 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 361 | ncpu_once.Do(func() { 362 | if runtime.NumCPU() < 2 { 363 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 364 | } 365 | rand.Seed(makeSeed()) 366 | }) 367 | runtime.GOMAXPROCS(4) 368 | cfg := &config{} 369 | cfg.t = t 370 | cfg.net = labrpc.MakeNetwork() 371 | cfg.n = n 372 | cfg.kvservers = make([]*KVServer, cfg.n) 373 | cfg.saved = make([]*raft.Persister, cfg.n) 374 | cfg.endnames = make([][]string, cfg.n) 375 | cfg.clerks = make(map[*Clerk][]string) 376 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 377 | cfg.maxraftstate = maxraftstate 378 | cfg.start = time.Now() 379 | 380 | // create a full set of KV servers. 381 | for i := 0; i < cfg.n; i++ { 382 | cfg.StartServer(i) 383 | } 384 | 385 | cfg.ConnectAll() 386 | 387 | cfg.net.Reliable(!unreliable) 388 | 389 | return cfg 390 | } 391 | 392 | func (cfg *config) rpcTotal() int { 393 | return cfg.net.GetTotalCount() 394 | } 395 | 396 | // start a Test. 397 | // print the Test message. 398 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 399 | func (cfg *config) begin(description string) { 400 | fmt.Printf("%s ...\n", description) 401 | cfg.t0 = time.Now() 402 | cfg.rpcs0 = cfg.rpcTotal() 403 | atomic.StoreInt32(&cfg.ops, 0) 404 | } 405 | 406 | func (cfg *config) op() { 407 | atomic.AddInt32(&cfg.ops, 1) 408 | } 409 | 410 | // end a Test -- the fact that we got here means there 411 | // was no failure. 412 | // print the Passed message, 413 | // and some performance numbers. 414 | func (cfg *config) end() { 415 | cfg.checkTimeout() 416 | if cfg.t.Failed() == false { 417 | t := time.Since(cfg.t0).Seconds() // real time 418 | npeers := cfg.n // number of Raft peers 419 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 420 | ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls 421 | 422 | fmt.Printf(" ... 
Passed --") 423 | fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /src/kvraft/server.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "labgob" 5 | "labrpc" 6 | "log" 7 | "raft" 8 | "sync" 9 | "sync/atomic" 10 | ) 11 | 12 | const Debug = 0 13 | 14 | func DPrintf(format string, a ...interface{}) (n int, err error) { 15 | if Debug > 0 { 16 | log.Printf(format, a...) 17 | } 18 | return 19 | } 20 | 21 | type Op struct { 22 | // Your definitions here. 23 | // Field names must start with capital letters, 24 | // otherwise RPC will break. 25 | } 26 | 27 | type KVServer struct { 28 | mu sync.Mutex 29 | me int 30 | rf *raft.Raft 31 | applyCh chan raft.ApplyMsg 32 | dead int32 // set by Kill() 33 | 34 | maxraftstate int // snapshot if log grows this big 35 | 36 | // Your definitions here. 37 | } 38 | 39 | func (kv *KVServer) Get(args *GetArgs, reply *GetReply) { 40 | // Your code here. 41 | } 42 | 43 | func (kv *KVServer) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 44 | // Your code here. 45 | } 46 | 47 | // 48 | // the tester calls Kill() when a KVServer instance won't 49 | // be needed again. for your convenience, we supply 50 | // code to set rf.dead (without needing a lock), 51 | // and a killed() method to test rf.dead in 52 | // long-running loops. you can also add your own 53 | // code to Kill(). you're not required to do anything 54 | // about this, but it may be convenient (for example) 55 | // to suppress debug output from a Kill()ed instance. 56 | // 57 | func (kv *KVServer) Kill() { 58 | atomic.StoreInt32(&kv.dead, 1) 59 | kv.rf.Kill() 60 | // Your code here, if desired. 61 | } 62 | 63 | func (kv *KVServer) killed() bool { 64 | z := atomic.LoadInt32(&kv.dead) 65 | return z == 1 66 | } 67 | 68 | // 69 | // servers[] contains the ports of the set of 70 | // servers that will cooperate via Raft to 71 | // form the fault-tolerant key/value service. 72 | // me is the index of the current server in servers[]. 73 | // the k/v server should store snapshots through the underlying Raft 74 | // implementation, which should call persister.SaveStateAndSnapshot() to 75 | // atomically save the Raft state along with the snapshot. 76 | // the k/v server should snapshot when Raft's saved state exceeds maxraftstate bytes, 77 | // in order to allow Raft to garbage-collect its log. if maxraftstate is -1, 78 | // you don't need to snapshot. 79 | // StartKVServer() must return quickly, so it should start goroutines 80 | // for any long-running work. 81 | // 82 | func StartKVServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int) *KVServer { 83 | // call labgob.Register on structures you want 84 | // Go's RPC library to marshall/unmarshall. 85 | labgob.Register(Op{}) 86 | 87 | kv := new(KVServer) 88 | kv.me = me 89 | kv.maxraftstate = maxraftstate 90 | 91 | // You may need initialization code here. 92 | 93 | kv.applyCh = make(chan raft.ApplyMsg) 94 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 95 | 96 | // You may need initialization code here. 
97 | 98 | return kv 99 | } 100 | -------------------------------------------------------------------------------- /src/labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | type LabEncoder struct { 23 | gob *gob.Encoder 24 | } 25 | 26 | func NewEncoder(w io.Writer) *LabEncoder { 27 | enc := &LabEncoder{} 28 | enc.gob = gob.NewEncoder(w) 29 | return enc 30 | } 31 | 32 | func (enc *LabEncoder) Encode(e interface{}) error { 33 | checkValue(e) 34 | return enc.gob.Encode(e) 35 | } 36 | 37 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 38 | checkValue(value.Interface()) 39 | return enc.gob.EncodeValue(value) 40 | } 41 | 42 | type LabDecoder struct { 43 | gob *gob.Decoder 44 | } 45 | 46 | func NewDecoder(r io.Reader) *LabDecoder { 47 | dec := &LabDecoder{} 48 | dec.gob = gob.NewDecoder(r) 49 | return dec 50 | } 51 | 52 | func (dec *LabDecoder) Decode(e interface{}) error { 53 | checkValue(e) 54 | checkDefault(e) 55 | return dec.gob.Decode(e) 56 | } 57 | 58 | func Register(value interface{}) { 59 | checkValue(value) 60 | gob.Register(value) 61 | } 62 | 63 | func RegisterName(name string, value interface{}) { 64 | checkValue(value) 65 | gob.RegisterName(name, value) 66 | } 67 | 68 | func checkValue(value interface{}) { 69 | checkType(reflect.TypeOf(value)) 70 | } 71 | 72 | func checkType(t reflect.Type) { 73 | k := t.Kind() 74 | 75 | mu.Lock() 76 | // only complain once, and avoid recursion. 77 | if checked == nil { 78 | checked = map[reflect.Type]bool{} 79 | } 80 | if checked[t] { 81 | mu.Unlock() 82 | return 83 | } 84 | checked[t] = true 85 | mu.Unlock() 86 | 87 | switch k { 88 | case reflect.Struct: 89 | for i := 0; i < t.NumField(); i++ { 90 | f := t.Field(i) 91 | rune, _ := utf8.DecodeRuneInString(f.Name) 92 | if unicode.IsUpper(rune) == false { 93 | // ta da 94 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 95 | f.Name, t.Name()) 96 | mu.Lock() 97 | errorCount += 1 98 | mu.Unlock() 99 | } 100 | checkType(f.Type) 101 | } 102 | return 103 | case reflect.Slice, reflect.Array, reflect.Ptr: 104 | checkType(t.Elem()) 105 | return 106 | case reflect.Map: 107 | checkType(t.Elem()) 108 | checkType(t.Key()) 109 | return 110 | default: 111 | return 112 | } 113 | } 114 | 115 | // 116 | // warn if the value contains non-default values, 117 | // as it would if one sent an RPC but the reply 118 | // struct was already modified. if the RPC reply 119 | // contains default values, GOB won't overwrite 120 | // the non-default value. 
121 | // 122 | func checkDefault(value interface{}) { 123 | if value == nil { 124 | return 125 | } 126 | checkDefault1(reflect.ValueOf(value), 1, "") 127 | } 128 | 129 | func checkDefault1(value reflect.Value, depth int, name string) { 130 | if depth > 3 { 131 | return 132 | } 133 | 134 | t := value.Type() 135 | k := t.Kind() 136 | 137 | switch k { 138 | case reflect.Struct: 139 | for i := 0; i < t.NumField(); i++ { 140 | vv := value.Field(i) 141 | name1 := t.Field(i).Name 142 | if name != "" { 143 | name1 = name + "." + name1 144 | } 145 | checkDefault1(vv, depth+1, name1) 146 | } 147 | return 148 | case reflect.Ptr: 149 | if value.IsNil() { 150 | return 151 | } 152 | checkDefault1(value.Elem(), depth+1, name) 153 | return 154 | case reflect.Bool, 155 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 156 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 157 | reflect.Uintptr, reflect.Float32, reflect.Float64, 158 | reflect.String: 159 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 160 | mu.Lock() 161 | if errorCount < 1 { 162 | what := name 163 | if what == "" { 164 | what = t.Name() 165 | } 166 | // this warning typically arises if code re-uses the same RPC reply 167 | // variable for multiple RPC calls, or if code restores persisted 168 | // state into variable that already have non-default values. 169 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 170 | what) 171 | } 172 | errorCount += 1 173 | mu.Unlock() 174 | } 175 | return 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 
26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 
163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/labrpc/labrpc.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | // 4 | // channel-based RPC, for 824 labs. 5 | // 6 | // simulates a network that can lose requests, lose replies, 7 | // delay messages, and entirely disconnect particular hosts. 8 | // 9 | // we will use the original labrpc.go to test your code for grading. 10 | // so, while you can modify this code to help you debug, please 11 | // test against the original before submitting. 12 | // 13 | // adapted from Go net/rpc/server.go. 14 | // 15 | // sends labgob-encoded values to ensure that RPCs 16 | // don't include references to program objects. 17 | // 18 | // net := MakeNetwork() -- holds network, clients, servers. 19 | // end := net.MakeEnd(endname) -- create a client end-point, to talk to one server. 20 | // net.AddServer(servername, server) -- adds a named server to network. 21 | // net.DeleteServer(servername) -- eliminate the named server. 22 | // net.Connect(endname, servername) -- connect a client to a server. 23 | // net.Enable(endname, enabled) -- enable/disable a client. 24 | // net.Reliable(bool) -- false means drop/delay messages 25 | // 26 | // end.Call("Raft.AppendEntries", &args, &reply) -- send an RPC, wait for reply. 27 | // the "Raft" is the name of the server struct to be called. 28 | // the "AppendEntries" is the name of the method to be called. 29 | // Call() returns true to indicate that the server executed the request 30 | // and the reply is valid. 31 | // Call() returns false if the network lost the request or reply 32 | // or the server is down. 33 | // It is OK to have multiple Call()s in progress at the same time on the 34 | // same ClientEnd. 35 | // Concurrent calls to Call() may be delivered to the server out of order, 36 | // since the network may re-order messages. 37 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 38 | // handler function on the server side does not return. 39 | // the server RPC handler function must declare its args and reply arguments 40 | // as pointers, so that their types exactly match the types of the arguments 41 | // to Call(). 42 | // 43 | // srv := MakeServer() 44 | // srv.AddService(svc) -- a server can have multiple services, e.g. Raft and k/v 45 | // pass srv to net.AddServer() 46 | // 47 | // svc := MakeService(receiverObject) -- obj's methods will handle RPCs 48 | // much like Go's rpcs.Register() 49 | // pass svc to srv.AddService() 50 | // 51 | 52 | import "Mit6824/src/labgob" 53 | import "bytes" 54 | import "reflect" 55 | import "sync" 56 | import "log" 57 | import "strings" 58 | import "math/rand" 59 | import "time" 60 | import "sync/atomic" 61 | 62 | type reqMsg struct { 63 | endname interface{} // name of sending ClientEnd 64 | svcMeth string // e.g. 
"Raft.AppendEntries" 65 | argsType reflect.Type 66 | args []byte 67 | replyCh chan replyMsg 68 | } 69 | 70 | type replyMsg struct { 71 | ok bool 72 | reply []byte 73 | } 74 | 75 | type ClientEnd struct { 76 | endname interface{} // this end-point's name 77 | ch chan reqMsg // copy of Network.endCh 78 | done chan struct{} // closed when Network is cleaned up 79 | } 80 | 81 | // send an RPC, wait for the reply. 82 | // the return value indicates success; false means that 83 | // no reply was received from the server. 84 | func (e *ClientEnd) Call(svcMeth string, args interface{}, reply interface{}) bool { 85 | req := reqMsg{} 86 | req.endname = e.endname 87 | req.svcMeth = svcMeth 88 | req.argsType = reflect.TypeOf(args) 89 | req.replyCh = make(chan replyMsg) 90 | 91 | qb := new(bytes.Buffer) 92 | qe := labgob.NewEncoder(qb) 93 | qe.Encode(args) 94 | req.args = qb.Bytes() 95 | 96 | select { 97 | case e.ch <- req: 98 | // ok 99 | case <-e.done: 100 | return false 101 | } 102 | 103 | rep := <-req.replyCh 104 | if rep.ok { 105 | rb := bytes.NewBuffer(rep.reply) 106 | rd := labgob.NewDecoder(rb) 107 | if err := rd.Decode(reply); err != nil { 108 | log.Fatalf("ClientEnd.Call(): decode reply: %v\n", err) 109 | } 110 | return true 111 | } else { 112 | return false 113 | } 114 | } 115 | 116 | type Network struct { 117 | mu sync.Mutex 118 | reliable bool 119 | longDelays bool // pause a long time on send on disabled connection 120 | longReordering bool // sometimes delay replies a long time 121 | ends map[interface{}]*ClientEnd // ends, by name 122 | enabled map[interface{}]bool // by end name 123 | servers map[interface{}]*Server // servers, by name 124 | connections map[interface{}]interface{} // endname -> servername 125 | endCh chan reqMsg 126 | done chan struct{} // closed when Network is cleaned up 127 | count int32 // total RPC count, for statistics 128 | } 129 | 130 | func MakeNetwork() *Network { 131 | rn := &Network{} 132 | rn.reliable = true 133 | rn.ends = map[interface{}]*ClientEnd{} 134 | rn.enabled = map[interface{}]bool{} 135 | rn.servers = map[interface{}]*Server{} 136 | rn.connections = map[interface{}](interface{}){} 137 | rn.endCh = make(chan reqMsg) 138 | rn.done = make(chan struct{}) 139 | 140 | // single goroutine to handle all ClientEnd.Call()s 141 | go func() { 142 | for { 143 | select { 144 | case xreq := <-rn.endCh: 145 | atomic.AddInt32(&rn.count, 1) 146 | go rn.processReq(xreq) 147 | case <-rn.done: 148 | return 149 | } 150 | } 151 | }() 152 | 153 | return rn 154 | } 155 | 156 | func (rn *Network) Cleanup() { 157 | close(rn.done) 158 | } 159 | 160 | func (rn *Network) Reliable(yes bool) { 161 | rn.mu.Lock() 162 | defer rn.mu.Unlock() 163 | 164 | rn.reliable = yes 165 | } 166 | 167 | func (rn *Network) LongReordering(yes bool) { 168 | rn.mu.Lock() 169 | defer rn.mu.Unlock() 170 | 171 | rn.longReordering = yes 172 | } 173 | 174 | func (rn *Network) LongDelays(yes bool) { 175 | rn.mu.Lock() 176 | defer rn.mu.Unlock() 177 | 178 | rn.longDelays = yes 179 | } 180 | 181 | func (rn *Network) readEndnameInfo(endname interface{}) (enabled bool, 182 | servername interface{}, server *Server, reliable bool, longreordering bool, 183 | ) { 184 | rn.mu.Lock() 185 | defer rn.mu.Unlock() 186 | 187 | enabled = rn.enabled[endname] 188 | servername = rn.connections[endname] 189 | if servername != nil { 190 | server = rn.servers[servername] 191 | } 192 | reliable = rn.reliable 193 | longreordering = rn.longReordering 194 | return 195 | } 196 | 197 | func (rn *Network) isServerDead(endname 
interface{}, servername interface{}, server *Server) bool { 198 | rn.mu.Lock() 199 | defer rn.mu.Unlock() 200 | 201 | if rn.enabled[endname] == false || rn.servers[servername] != server { 202 | return true 203 | } 204 | return false 205 | } 206 | 207 | func (rn *Network) processReq(req reqMsg) { 208 | enabled, servername, server, reliable, longreordering := rn.readEndnameInfo(req.endname) 209 | 210 | if enabled && servername != nil && server != nil { 211 | if reliable == false { 212 | // short delay 213 | ms := (rand.Int() % 27) 214 | time.Sleep(time.Duration(ms) * time.Millisecond) 215 | } 216 | 217 | if reliable == false && (rand.Int()%1000) < 100 { 218 | // drop the request, return as if timeout 219 | req.replyCh <- replyMsg{false, nil} 220 | return 221 | } 222 | 223 | // execute the request (call the RPC handler). 224 | // in a separate thread so that we can periodically check 225 | // if the server has been killed and the RPC should get a 226 | // failure reply. 227 | ech := make(chan replyMsg) 228 | go func() { 229 | r := server.dispatch(req) 230 | ech <- r 231 | }() 232 | 233 | // wait for handler to return, 234 | // but stop waiting if DeleteServer() has been called, 235 | // and return an error. 236 | var reply replyMsg 237 | replyOK := false 238 | serverDead := false 239 | for replyOK == false && serverDead == false { 240 | select { 241 | case reply = <-ech: 242 | replyOK = true 243 | case <-time.After(100 * time.Millisecond): 244 | serverDead = rn.isServerDead(req.endname, servername, server) 245 | if serverDead { 246 | go func() { 247 | <-ech // drain channel to let the goroutine created earlier terminate 248 | }() 249 | } 250 | } 251 | } 252 | 253 | // do not reply if DeleteServer() has been called, i.e. 254 | // the server has been killed. this is needed to avoid 255 | // situation in which a client gets a positive reply 256 | // to an Append, but the server persisted the update 257 | // into the old Persister. config.go is careful to call 258 | // DeleteServer() before superseding the Persister. 259 | serverDead = rn.isServerDead(req.endname, servername, server) 260 | 261 | if replyOK == false || serverDead == true { 262 | // server was killed while we were waiting; return error. 263 | req.replyCh <- replyMsg{false, nil} 264 | } else if reliable == false && (rand.Int()%1000) < 100 { 265 | // drop the reply, return as if timeout 266 | req.replyCh <- replyMsg{false, nil} 267 | } else if longreordering == true && rand.Intn(900) < 600 { 268 | // delay the response for a while 269 | ms := 200 + rand.Intn(1+rand.Intn(2000)) 270 | // Russ points out that this timer arrangement will decrease 271 | // the number of goroutines, so that the race 272 | // detector is less likely to get upset. 273 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 274 | req.replyCh <- reply 275 | }) 276 | } else { 277 | req.replyCh <- reply 278 | } 279 | } else { 280 | // simulate no reply and eventual timeout. 281 | ms := 0 282 | if rn.longDelays { 283 | // let Raft tests check that leader doesn't send 284 | // RPCs synchronously. 285 | ms = (rand.Int() % 7000) 286 | } else { 287 | // many kv tests require the client to try each 288 | // server in fairly rapid succession. 289 | ms = (rand.Int() % 100) 290 | } 291 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 292 | req.replyCh <- replyMsg{false, nil} 293 | }) 294 | } 295 | 296 | } 297 | 298 | // create a client end-point. 299 | // start the thread that listens and delivers. 
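// (note: in this version MakeEnd itself does not start a goroutine; requests
// are delivered by the single goroutine started in MakeNetwork, which reads
// rn.endCh and dispatches each call via processReq.)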
300 | func (rn *Network) MakeEnd(endname interface{}) *ClientEnd { 301 | rn.mu.Lock() 302 | defer rn.mu.Unlock() 303 | 304 | if _, ok := rn.ends[endname]; ok { 305 | log.Fatalf("MakeEnd: %v already exists\n", endname) 306 | } 307 | 308 | e := &ClientEnd{} 309 | e.endname = endname 310 | e.ch = rn.endCh 311 | e.done = rn.done 312 | rn.ends[endname] = e 313 | rn.enabled[endname] = false 314 | rn.connections[endname] = nil 315 | 316 | return e 317 | } 318 | 319 | func (rn *Network) AddServer(servername interface{}, rs *Server) { 320 | rn.mu.Lock() 321 | defer rn.mu.Unlock() 322 | 323 | rn.servers[servername] = rs 324 | } 325 | 326 | func (rn *Network) DeleteServer(servername interface{}) { 327 | rn.mu.Lock() 328 | defer rn.mu.Unlock() 329 | 330 | rn.servers[servername] = nil 331 | } 332 | 333 | // connect a ClientEnd to a server. 334 | // a ClientEnd can only be connected once in its lifetime. 335 | func (rn *Network) Connect(endname interface{}, servername interface{}) { 336 | rn.mu.Lock() 337 | defer rn.mu.Unlock() 338 | 339 | rn.connections[endname] = servername 340 | } 341 | 342 | // enable/disable a ClientEnd. 343 | func (rn *Network) Enable(endname interface{}, enabled bool) { 344 | rn.mu.Lock() 345 | defer rn.mu.Unlock() 346 | 347 | rn.enabled[endname] = enabled 348 | } 349 | 350 | // get a server's count of incoming RPCs. 351 | func (rn *Network) GetCount(servername interface{}) int { 352 | rn.mu.Lock() 353 | defer rn.mu.Unlock() 354 | 355 | svr := rn.servers[servername] 356 | return svr.GetCount() 357 | } 358 | 359 | func (rn *Network) GetTotalCount() int { 360 | x := atomic.LoadInt32(&rn.count) 361 | return int(x) 362 | } 363 | 364 | // 365 | // a server is a collection of services, all sharing 366 | // the same rpc dispatcher. so that e.g. both a Raft 367 | // and a k/v server can listen to the same rpc endpoint. 368 | // 369 | type Server struct { 370 | mu sync.Mutex 371 | services map[string]*Service 372 | count int // incoming RPCs 373 | } 374 | 375 | func MakeServer() *Server { 376 | rs := &Server{} 377 | rs.services = map[string]*Service{} 378 | return rs 379 | } 380 | 381 | func (rs *Server) AddService(svc *Service) { 382 | rs.mu.Lock() 383 | defer rs.mu.Unlock() 384 | rs.services[svc.name] = svc 385 | } 386 | 387 | func (rs *Server) dispatch(req reqMsg) replyMsg { 388 | rs.mu.Lock() 389 | 390 | rs.count += 1 391 | 392 | // split Raft.AppendEntries into service and method 393 | dot := strings.LastIndex(req.svcMeth, ".") 394 | serviceName := req.svcMeth[:dot] 395 | methodName := req.svcMeth[dot+1:] 396 | 397 | service, ok := rs.services[serviceName] 398 | 399 | rs.mu.Unlock() 400 | 401 | if ok { 402 | return service.dispatch(methodName, req) 403 | } else { 404 | choices := []string{} 405 | for k, _ := range rs.services { 406 | choices = append(choices, k) 407 | } 408 | log.Fatalf("labrpc.Server.dispatch(): unknown service %v in %v.%v; expecting one of %v\n", 409 | serviceName, serviceName, methodName, choices) 410 | return replyMsg{false, nil} 411 | } 412 | } 413 | 414 | func (rs *Server) GetCount() int { 415 | rs.mu.Lock() 416 | defer rs.mu.Unlock() 417 | return rs.count 418 | } 419 | 420 | // an object with methods that can be called via RPC. 421 | // a single server may have more than one Service. 
422 | type Service struct { 423 | name string 424 | rcvr reflect.Value 425 | typ reflect.Type 426 | methods map[string]reflect.Method 427 | } 428 | 429 | func MakeService(rcvr interface{}) *Service { 430 | svc := &Service{} 431 | svc.typ = reflect.TypeOf(rcvr) 432 | svc.rcvr = reflect.ValueOf(rcvr) 433 | svc.name = reflect.Indirect(svc.rcvr).Type().Name() 434 | svc.methods = map[string]reflect.Method{} 435 | 436 | for m := 0; m < svc.typ.NumMethod(); m++ { 437 | method := svc.typ.Method(m) 438 | mtype := method.Type 439 | mname := method.Name 440 | 441 | //fmt.Printf("%v pp %v ni %v 1k %v 2k %v no %v\n", 442 | // mname, method.PkgPath, mtype.NumIn(), mtype.In(1).Kind(), mtype.In(2).Kind(), mtype.NumOut()) 443 | 444 | if method.PkgPath != "" || // capitalized? 445 | mtype.NumIn() != 3 || 446 | //mtype.In(1).Kind() != reflect.Ptr || 447 | mtype.In(2).Kind() != reflect.Ptr || 448 | mtype.NumOut() != 0 { 449 | // the method is not suitable for a handler 450 | //fmt.Printf("bad method: %v\n", mname) 451 | } else { 452 | // the method looks like a handler 453 | svc.methods[mname] = method 454 | } 455 | } 456 | 457 | return svc 458 | } 459 | 460 | func (svc *Service) dispatch(methname string, req reqMsg) replyMsg { 461 | if method, ok := svc.methods[methname]; ok { 462 | // prepare space into which to read the argument. 463 | // the Value's type will be a pointer to req.argsType. 464 | args := reflect.New(req.argsType) 465 | 466 | // decode the argument. 467 | ab := bytes.NewBuffer(req.args) 468 | ad := labgob.NewDecoder(ab) 469 | ad.Decode(args.Interface()) 470 | 471 | // allocate space for the reply. 472 | replyType := method.Type.In(2) 473 | replyType = replyType.Elem() 474 | replyv := reflect.New(replyType) 475 | 476 | // call the method. 477 | function := method.Func 478 | function.Call([]reflect.Value{svc.rcvr, args.Elem(), replyv}) 479 | 480 | // encode the reply. 
481 | rb := new(bytes.Buffer) 482 | re := labgob.NewEncoder(rb) 483 | re.EncodeValue(replyv) 484 | 485 | return replyMsg{true, rb.Bytes()} 486 | } else { 487 | choices := []string{} 488 | for k, _ := range svc.methods { 489 | choices = append(choices, k) 490 | } 491 | log.Fatalf("labrpc.Service.dispatch(): unknown method %v in %v; expecting one of %v\n", 492 | methname, req.svcMeth, choices) 493 | return replyMsg{false, nil} 494 | } 495 | } 496 | -------------------------------------------------------------------------------- /src/labrpc/test_test.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | import "testing" 4 | import "strconv" 5 | import "sync" 6 | import "runtime" 7 | import "time" 8 | import "fmt" 9 | 10 | type JunkArgs struct { 11 | X int 12 | } 13 | type JunkReply struct { 14 | X string 15 | } 16 | 17 | type JunkServer struct { 18 | mu sync.Mutex 19 | log1 []string 20 | log2 []int 21 | } 22 | 23 | func (js *JunkServer) Handler1(args string, reply *int) { 24 | js.mu.Lock() 25 | defer js.mu.Unlock() 26 | js.log1 = append(js.log1, args) 27 | *reply, _ = strconv.Atoi(args) 28 | } 29 | 30 | func (js *JunkServer) Handler2(args int, reply *string) { 31 | js.mu.Lock() 32 | defer js.mu.Unlock() 33 | js.log2 = append(js.log2, args) 34 | *reply = "handler2-" + strconv.Itoa(args) 35 | } 36 | 37 | func (js *JunkServer) Handler3(args int, reply *int) { 38 | js.mu.Lock() 39 | defer js.mu.Unlock() 40 | time.Sleep(20 * time.Second) 41 | *reply = -args 42 | } 43 | 44 | // args is a pointer 45 | func (js *JunkServer) Handler4(args *JunkArgs, reply *JunkReply) { 46 | reply.X = "pointer" 47 | } 48 | 49 | // args is a not pointer 50 | func (js *JunkServer) Handler5(args JunkArgs, reply *JunkReply) { 51 | reply.X = "no pointer" 52 | } 53 | 54 | func TestBasic(t *testing.T) { 55 | runtime.GOMAXPROCS(4) 56 | 57 | rn := MakeNetwork() 58 | defer rn.Cleanup() 59 | 60 | e := rn.MakeEnd("end1-99") 61 | 62 | js := &JunkServer{} 63 | svc := MakeService(js) 64 | 65 | rs := MakeServer() 66 | rs.AddService(svc) 67 | rn.AddServer("server99", rs) 68 | 69 | rn.Connect("end1-99", "server99") 70 | rn.Enable("end1-99", true) 71 | 72 | { 73 | reply := "" 74 | e.Call("JunkServer.Handler2", 111, &reply) 75 | if reply != "handler2-111" { 76 | t.Fatalf("wrong reply from Handler2") 77 | } 78 | } 79 | 80 | { 81 | reply := 0 82 | e.Call("JunkServer.Handler1", "9099", &reply) 83 | if reply != 9099 { 84 | t.Fatalf("wrong reply from Handler1") 85 | } 86 | } 87 | } 88 | 89 | func TestTypes(t *testing.T) { 90 | runtime.GOMAXPROCS(4) 91 | 92 | rn := MakeNetwork() 93 | defer rn.Cleanup() 94 | 95 | e := rn.MakeEnd("end1-99") 96 | 97 | js := &JunkServer{} 98 | svc := MakeService(js) 99 | 100 | rs := MakeServer() 101 | rs.AddService(svc) 102 | rn.AddServer("server99", rs) 103 | 104 | rn.Connect("end1-99", "server99") 105 | rn.Enable("end1-99", true) 106 | 107 | { 108 | var args JunkArgs 109 | var reply JunkReply 110 | // args must match type (pointer or not) of handler. 111 | e.Call("JunkServer.Handler4", &args, &reply) 112 | if reply.X != "pointer" { 113 | t.Fatalf("wrong reply from Handler4") 114 | } 115 | } 116 | 117 | { 118 | var args JunkArgs 119 | var reply JunkReply 120 | // args must match type (pointer or not) of handler. 121 | e.Call("JunkServer.Handler5", args, &reply) 122 | if reply.X != "no pointer" { 123 | t.Fatalf("wrong reply from Handler5") 124 | } 125 | } 126 | } 127 | 128 | // 129 | // does net.Enable(endname, false) really disconnect a client? 
130 | // 131 | func TestDisconnect(t *testing.T) { 132 | runtime.GOMAXPROCS(4) 133 | 134 | rn := MakeNetwork() 135 | defer rn.Cleanup() 136 | 137 | e := rn.MakeEnd("end1-99") 138 | 139 | js := &JunkServer{} 140 | svc := MakeService(js) 141 | 142 | rs := MakeServer() 143 | rs.AddService(svc) 144 | rn.AddServer("server99", rs) 145 | 146 | rn.Connect("end1-99", "server99") 147 | 148 | { 149 | reply := "" 150 | e.Call("JunkServer.Handler2", 111, &reply) 151 | if reply != "" { 152 | t.Fatalf("unexpected reply from Handler2") 153 | } 154 | } 155 | 156 | rn.Enable("end1-99", true) 157 | 158 | { 159 | reply := 0 160 | e.Call("JunkServer.Handler1", "9099", &reply) 161 | if reply != 9099 { 162 | t.Fatalf("wrong reply from Handler1") 163 | } 164 | } 165 | } 166 | 167 | // 168 | // test net.GetCount() 169 | // 170 | func TestCounts(t *testing.T) { 171 | runtime.GOMAXPROCS(4) 172 | 173 | rn := MakeNetwork() 174 | defer rn.Cleanup() 175 | 176 | e := rn.MakeEnd("end1-99") 177 | 178 | js := &JunkServer{} 179 | svc := MakeService(js) 180 | 181 | rs := MakeServer() 182 | rs.AddService(svc) 183 | rn.AddServer(99, rs) 184 | 185 | rn.Connect("end1-99", 99) 186 | rn.Enable("end1-99", true) 187 | 188 | for i := 0; i < 17; i++ { 189 | reply := "" 190 | e.Call("JunkServer.Handler2", i, &reply) 191 | wanted := "handler2-" + strconv.Itoa(i) 192 | if reply != wanted { 193 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 194 | } 195 | } 196 | 197 | n := rn.GetCount(99) 198 | if n != 17 { 199 | t.Fatalf("wrong GetCount() %v, expected 17\n", n) 200 | } 201 | } 202 | 203 | // 204 | // test RPCs from concurrent ClientEnds 205 | // 206 | func TestConcurrentMany(t *testing.T) { 207 | runtime.GOMAXPROCS(4) 208 | 209 | rn := MakeNetwork() 210 | defer rn.Cleanup() 211 | 212 | js := &JunkServer{} 213 | svc := MakeService(js) 214 | 215 | rs := MakeServer() 216 | rs.AddService(svc) 217 | rn.AddServer(1000, rs) 218 | 219 | ch := make(chan int) 220 | 221 | nclients := 20 222 | nrpcs := 10 223 | for ii := 0; ii < nclients; ii++ { 224 | go func(i int) { 225 | n := 0 226 | defer func() { ch <- n }() 227 | 228 | e := rn.MakeEnd(i) 229 | rn.Connect(i, 1000) 230 | rn.Enable(i, true) 231 | 232 | for j := 0; j < nrpcs; j++ { 233 | arg := i*100 + j 234 | reply := "" 235 | e.Call("JunkServer.Handler2", arg, &reply) 236 | wanted := "handler2-" + strconv.Itoa(arg) 237 | if reply != wanted { 238 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 239 | } 240 | n += 1 241 | } 242 | }(ii) 243 | } 244 | 245 | total := 0 246 | for ii := 0; ii < nclients; ii++ { 247 | x := <-ch 248 | total += x 249 | } 250 | 251 | if total != nclients*nrpcs { 252 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nclients*nrpcs) 253 | } 254 | 255 | n := rn.GetCount(1000) 256 | if n != total { 257 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 258 | } 259 | } 260 | 261 | // 262 | // test unreliable 263 | // 264 | func TestUnreliable(t *testing.T) { 265 | runtime.GOMAXPROCS(4) 266 | 267 | rn := MakeNetwork() 268 | defer rn.Cleanup() 269 | rn.Reliable(false) 270 | 271 | js := &JunkServer{} 272 | svc := MakeService(js) 273 | 274 | rs := MakeServer() 275 | rs.AddService(svc) 276 | rn.AddServer(1000, rs) 277 | 278 | ch := make(chan int) 279 | 280 | nclients := 300 281 | for ii := 0; ii < nclients; ii++ { 282 | go func(i int) { 283 | n := 0 284 | defer func() { ch <- n }() 285 | 286 | e := rn.MakeEnd(i) 287 | rn.Connect(i, 1000) 288 | rn.Enable(i, true) 289 | 290 | arg := i * 100 291 | reply := "" 
292 | ok := e.Call("JunkServer.Handler2", arg, &reply) 293 | if ok { 294 | wanted := "handler2-" + strconv.Itoa(arg) 295 | if reply != wanted { 296 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 297 | } 298 | n += 1 299 | } 300 | }(ii) 301 | } 302 | 303 | total := 0 304 | for ii := 0; ii < nclients; ii++ { 305 | x := <-ch 306 | total += x 307 | } 308 | 309 | if total == nclients || total == 0 { 310 | t.Fatalf("all RPCs succeeded despite unreliable") 311 | } 312 | } 313 | 314 | // 315 | // test concurrent RPCs from a single ClientEnd 316 | // 317 | func TestConcurrentOne(t *testing.T) { 318 | runtime.GOMAXPROCS(4) 319 | 320 | rn := MakeNetwork() 321 | defer rn.Cleanup() 322 | 323 | js := &JunkServer{} 324 | svc := MakeService(js) 325 | 326 | rs := MakeServer() 327 | rs.AddService(svc) 328 | rn.AddServer(1000, rs) 329 | 330 | e := rn.MakeEnd("c") 331 | rn.Connect("c", 1000) 332 | rn.Enable("c", true) 333 | 334 | ch := make(chan int) 335 | 336 | nrpcs := 20 337 | for ii := 0; ii < nrpcs; ii++ { 338 | go func(i int) { 339 | n := 0 340 | defer func() { ch <- n }() 341 | 342 | arg := 100 + i 343 | reply := "" 344 | e.Call("JunkServer.Handler2", arg, &reply) 345 | wanted := "handler2-" + strconv.Itoa(arg) 346 | if reply != wanted { 347 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 348 | } 349 | n += 1 350 | }(ii) 351 | } 352 | 353 | total := 0 354 | for ii := 0; ii < nrpcs; ii++ { 355 | x := <-ch 356 | total += x 357 | } 358 | 359 | if total != nrpcs { 360 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nrpcs) 361 | } 362 | 363 | js.mu.Lock() 364 | defer js.mu.Unlock() 365 | if len(js.log2) != nrpcs { 366 | t.Fatalf("wrong number of RPCs delivered") 367 | } 368 | 369 | n := rn.GetCount(1000) 370 | if n != total { 371 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 372 | } 373 | } 374 | 375 | // 376 | // regression: an RPC that's delayed during Enabled=false 377 | // should not delay subsequent RPCs (e.g. after Enabled=true). 378 | // 379 | func TestRegression1(t *testing.T) { 380 | runtime.GOMAXPROCS(4) 381 | 382 | rn := MakeNetwork() 383 | defer rn.Cleanup() 384 | 385 | js := &JunkServer{} 386 | svc := MakeService(js) 387 | 388 | rs := MakeServer() 389 | rs.AddService(svc) 390 | rn.AddServer(1000, rs) 391 | 392 | e := rn.MakeEnd("c") 393 | rn.Connect("c", 1000) 394 | 395 | // start some RPCs while the ClientEnd is disabled. 396 | // they'll be delayed. 397 | rn.Enable("c", false) 398 | ch := make(chan bool) 399 | nrpcs := 20 400 | for ii := 0; ii < nrpcs; ii++ { 401 | go func(i int) { 402 | ok := false 403 | defer func() { ch <- ok }() 404 | 405 | arg := 100 + i 406 | reply := "" 407 | // this call ought to return false. 408 | e.Call("JunkServer.Handler2", arg, &reply) 409 | ok = true 410 | }(ii) 411 | } 412 | 413 | time.Sleep(100 * time.Millisecond) 414 | 415 | // now enable the ClientEnd and check that an RPC completes quickly. 
416 | t0 := time.Now() 417 | rn.Enable("c", true) 418 | { 419 | arg := 99 420 | reply := "" 421 | e.Call("JunkServer.Handler2", arg, &reply) 422 | wanted := "handler2-" + strconv.Itoa(arg) 423 | if reply != wanted { 424 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 425 | } 426 | } 427 | dur := time.Since(t0).Seconds() 428 | 429 | if dur > 0.03 { 430 | t.Fatalf("RPC took too long (%v) after Enable", dur) 431 | } 432 | 433 | for ii := 0; ii < nrpcs; ii++ { 434 | <-ch 435 | } 436 | 437 | js.mu.Lock() 438 | defer js.mu.Unlock() 439 | if len(js.log2) != 1 { 440 | t.Fatalf("wrong number (%v) of RPCs delivered, expected 1", len(js.log2)) 441 | } 442 | 443 | n := rn.GetCount(1000) 444 | if n != 1 { 445 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, 1) 446 | } 447 | } 448 | 449 | // 450 | // if an RPC is stuck in a server, and the server 451 | // is killed with DeleteServer(), does the RPC 452 | // get un-stuck? 453 | // 454 | func TestKilled(t *testing.T) { 455 | runtime.GOMAXPROCS(4) 456 | 457 | rn := MakeNetwork() 458 | defer rn.Cleanup() 459 | 460 | e := rn.MakeEnd("end1-99") 461 | 462 | js := &JunkServer{} 463 | svc := MakeService(js) 464 | 465 | rs := MakeServer() 466 | rs.AddService(svc) 467 | rn.AddServer("server99", rs) 468 | 469 | rn.Connect("end1-99", "server99") 470 | rn.Enable("end1-99", true) 471 | 472 | doneCh := make(chan bool) 473 | go func() { 474 | reply := 0 475 | ok := e.Call("JunkServer.Handler3", 99, &reply) 476 | doneCh <- ok 477 | }() 478 | 479 | time.Sleep(1000 * time.Millisecond) 480 | 481 | select { 482 | case <-doneCh: 483 | t.Fatalf("Handler3 should not have returned yet") 484 | case <-time.After(100 * time.Millisecond): 485 | } 486 | 487 | rn.DeleteServer("server99") 488 | 489 | select { 490 | case x := <-doneCh: 491 | if x != false { 492 | t.Fatalf("Handler3 returned successfully despite DeleteServer()") 493 | } 494 | case <-time.After(100 * time.Millisecond): 495 | t.Fatalf("Handler3 should return after DeleteServer()") 496 | } 497 | } 498 | 499 | func TestBenchmark(t *testing.T) { 500 | runtime.GOMAXPROCS(4) 501 | 502 | rn := MakeNetwork() 503 | defer rn.Cleanup() 504 | 505 | e := rn.MakeEnd("end1-99") 506 | 507 | js := &JunkServer{} 508 | svc := MakeService(js) 509 | 510 | rs := MakeServer() 511 | rs.AddService(svc) 512 | rn.AddServer("server99", rs) 513 | 514 | rn.Connect("end1-99", "server99") 515 | rn.Enable("end1-99", true) 516 | 517 | t0 := time.Now() 518 | n := 100000 519 | for iters := 0; iters < n; iters++ { 520 | reply := "" 521 | e.Call("JunkServer.Handler2", 111, &reply) 522 | if reply != "handler2-111" { 523 | t.Fatalf("wrong reply from Handler2") 524 | } 525 | } 526 | fmt.Printf("%v for %v\n", time.Since(t0), n) 527 | // march 2016, rtm laptop, 22 microseconds per RPC 528 | } 529 | -------------------------------------------------------------------------------- /src/linearizability/bitset.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | type bitset []uint64 4 | 5 | // data layout: 6 | // bits 0-63 are in data[0], the next are in data[1], etc. 
7 | 8 | func newBitset(bits uint) bitset { 9 | extra := uint(0) 10 | if bits%64 != 0 { 11 | extra = 1 12 | } 13 | chunks := bits/64 + extra 14 | return bitset(make([]uint64, chunks)) 15 | } 16 | 17 | func (b bitset) clone() bitset { 18 | dataCopy := make([]uint64, len(b)) 19 | copy(dataCopy, b) 20 | return bitset(dataCopy) 21 | } 22 | 23 | func bitsetIndex(pos uint) (uint, uint) { 24 | return pos / 64, pos % 64 25 | } 26 | 27 | func (b bitset) set(pos uint) bitset { 28 | major, minor := bitsetIndex(pos) 29 | b[major] |= (1 << minor) 30 | return b 31 | } 32 | 33 | func (b bitset) clear(pos uint) bitset { 34 | major, minor := bitsetIndex(pos) 35 | b[major] &^= (1 << minor) 36 | return b 37 | } 38 | 39 | func (b bitset) get(pos uint) bool { 40 | major, minor := bitsetIndex(pos) 41 | return b[major]&(1<<minor) != 0 42 | } 43 | 44 | func (b bitset) popcnt() uint { 45 | total := uint(0) 46 | for _, v := range b { 47 | v = (v & 0x5555555555555555) + ((v & 0xAAAAAAAAAAAAAAAA) >> 1) 48 | v = (v & 0x3333333333333333) + ((v & 0xCCCCCCCCCCCCCCCC) >> 2) 49 | v = (v & 0x0F0F0F0F0F0F0F0F) + ((v & 0xF0F0F0F0F0F0F0F0) >> 4) 50 | v *= 0x0101010101010101 51 | total += uint((v >> 56) & 0xFF) 52 | } 53 | return total 54 | } 55 | 56 | func (b bitset) hash() uint64 { 57 | hash := uint64(b.popcnt()) 58 | for _, v := range b { 59 | hash ^= v 60 | } 61 | return hash 62 | } 63 | 64 | func (b bitset) equals(b2 bitset) bool { 65 | if len(b) != len(b2) { 66 | return false 67 | } 68 | for i := range b { 69 | if b[i] != b2[i] { 70 | return false 71 | } 72 | } 73 | return true 74 | } 75 | -------------------------------------------------------------------------------- /src/linearizability/linearizability.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | import ( 4 | "sort" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type entryKind bool 10 | 11 | const ( 12 | callEntry entryKind = false 13 | returnEntry = true 14 | ) 15 | 16 | type entry struct { 17 | kind entryKind 18 | value interface{} 19 | id uint 20 | time int64 21 | } 22 | 23 | type byTime []entry 24 | 25 | func (a byTime) Len() int { 26 | return len(a) 27 | } 28 | 29 | func (a byTime) Swap(i, j int) { 30 | a[i], a[j] = a[j], a[i] 31 | } 32 | 33 | func (a byTime) Less(i, j int) bool { 34 | return a[i].time < a[j].time 35 | } 36 | 37 | func makeEntries(history []Operation) []entry { 38 | var entries []entry = nil 39 | id := uint(0) 40 | for _, elem := range history { 41 | entries = append(entries, entry{ 42 | callEntry, elem.Input, id, elem.Call}) 43 | entries = append(entries, entry{ 44 | returnEntry, elem.Output, id, elem.Return}) 45 | id++ 46 | } 47 | sort.Sort(byTime(entries)) 48 | return entries 49 | } 50 | 51 | type node struct { 52 | value interface{} 53 | match *node // call if match is nil, otherwise return 54 | id uint 55 | next *node 56 | prev *node 57 | } 58 | 59 | func insertBefore(n *node, mark *node) *node { 60 | if mark != nil { 61 | beforeMark := mark.prev 62 | mark.prev = n 63 | n.next = mark 64 | if beforeMark != nil { 65 | n.prev = beforeMark 66 | beforeMark.next = n 67 | } 68 | } 69 | return n 70 | } 71 | 72 | func length(n *node) uint { 73 | l := uint(0) 74 | for n != nil { 75 | n = n.next 76 | l++ 77 | } 78 | return l 79 | } 80 | 81 | func renumber(events []Event) []Event { 82 | var e []Event 83 | m := make(map[uint]uint) // renumbering 84 | id := uint(0) 85 | for _, v := range events { 86 | if r, ok := m[v.Id]; ok { 87 | e = append(e, Event{v.Kind, v.Value, r}) 88 | } else { 89 | e = append(e, Event{v.Kind, v.Value, id}) 90 | m[v.Id] = id 91 | id++ 92 | } 93 | } 94 | return e 95 | } 96 | 97 | func convertEntries(events []Event) []entry { 98 | var
entries []entry 99 | for _, elem := range events { 100 | kind := callEntry 101 | if elem.Kind == ReturnEvent { 102 | kind = returnEntry 103 | } 104 | entries = append(entries, entry{kind, elem.Value, elem.Id, -1}) 105 | } 106 | return entries 107 | } 108 | 109 | func makeLinkedEntries(entries []entry) *node { 110 | var root *node = nil 111 | match := make(map[uint]*node) 112 | for i := len(entries) - 1; i >= 0; i-- { 113 | elem := entries[i] 114 | if elem.kind == returnEntry { 115 | entry := &node{value: elem.value, match: nil, id: elem.id} 116 | match[elem.id] = entry 117 | insertBefore(entry, root) 118 | root = entry 119 | } else { 120 | entry := &node{value: elem.value, match: match[elem.id], id: elem.id} 121 | insertBefore(entry, root) 122 | root = entry 123 | } 124 | } 125 | return root 126 | } 127 | 128 | type cacheEntry struct { 129 | linearized bitset 130 | state interface{} 131 | } 132 | 133 | func cacheContains(model Model, cache map[uint64][]cacheEntry, entry cacheEntry) bool { 134 | for _, elem := range cache[entry.linearized.hash()] { 135 | if entry.linearized.equals(elem.linearized) && model.Equal(entry.state, elem.state) { 136 | return true 137 | } 138 | } 139 | return false 140 | } 141 | 142 | type callsEntry struct { 143 | entry *node 144 | state interface{} 145 | } 146 | 147 | func lift(entry *node) { 148 | entry.prev.next = entry.next 149 | entry.next.prev = entry.prev 150 | match := entry.match 151 | match.prev.next = match.next 152 | if match.next != nil { 153 | match.next.prev = match.prev 154 | } 155 | } 156 | 157 | func unlift(entry *node) { 158 | match := entry.match 159 | match.prev.next = match 160 | if match.next != nil { 161 | match.next.prev = match 162 | } 163 | entry.prev.next = entry 164 | entry.next.prev = entry 165 | } 166 | 167 | func checkSingle(model Model, subhistory *node, kill *int32) bool { 168 | n := length(subhistory) / 2 169 | linearized := newBitset(n) 170 | cache := make(map[uint64][]cacheEntry) // map from hash to cache entry 171 | var calls []callsEntry 172 | 173 | state := model.Init() 174 | headEntry := insertBefore(&node{value: nil, match: nil, id: ^uint(0)}, subhistory) 175 | entry := subhistory 176 | for headEntry.next != nil { 177 | if atomic.LoadInt32(kill) != 0 { 178 | return false 179 | } 180 | if entry.match != nil { 181 | matching := entry.match // the return entry 182 | ok, newState := model.Step(state, entry.value, matching.value) 183 | if ok { 184 | newLinearized := linearized.clone().set(entry.id) 185 | newCacheEntry := cacheEntry{newLinearized, newState} 186 | if !cacheContains(model, cache, newCacheEntry) { 187 | hash := newLinearized.hash() 188 | cache[hash] = append(cache[hash], newCacheEntry) 189 | calls = append(calls, callsEntry{entry, state}) 190 | state = newState 191 | linearized.set(entry.id) 192 | lift(entry) 193 | entry = headEntry.next 194 | } else { 195 | entry = entry.next 196 | } 197 | } else { 198 | entry = entry.next 199 | } 200 | } else { 201 | if len(calls) == 0 { 202 | return false 203 | } 204 | callsTop := calls[len(calls)-1] 205 | entry = callsTop.entry 206 | state = callsTop.state 207 | linearized.clear(entry.id) 208 | calls = calls[:len(calls)-1] 209 | unlift(entry) 210 | entry = entry.next 211 | } 212 | } 213 | return true 214 | } 215 | 216 | func fillDefault(model Model) Model { 217 | if model.Partition == nil { 218 | model.Partition = NoPartition 219 | } 220 | if model.PartitionEvent == nil { 221 | model.PartitionEvent = NoPartitionEvent 222 | } 223 | if model.Equal == nil { 224 | model.Equal = 
ShallowEqual 225 | } 226 | return model 227 | } 228 | 229 | func CheckOperations(model Model, history []Operation) bool { 230 | return CheckOperationsTimeout(model, history, 0) 231 | } 232 | 233 | // timeout = 0 means no timeout 234 | // if this operation times out, then a false positive is possible 235 | func CheckOperationsTimeout(model Model, history []Operation, timeout time.Duration) bool { 236 | model = fillDefault(model) 237 | partitions := model.Partition(history) 238 | ok := true 239 | results := make(chan bool) 240 | kill := int32(0) 241 | for _, subhistory := range partitions { 242 | l := makeLinkedEntries(makeEntries(subhistory)) 243 | go func() { 244 | results <- checkSingle(model, l, &kill) 245 | }() 246 | } 247 | var timeoutChan <-chan time.Time 248 | if timeout > 0 { 249 | timeoutChan = time.After(timeout) 250 | } 251 | count := 0 252 | loop: 253 | for { 254 | select { 255 | case result := <-results: 256 | ok = ok && result 257 | if !ok { 258 | atomic.StoreInt32(&kill, 1) 259 | break loop 260 | } 261 | count++ 262 | if count >= len(partitions) { 263 | break loop 264 | } 265 | case <-timeoutChan: 266 | break loop // if we time out, we might get a false positive 267 | } 268 | } 269 | return ok 270 | } 271 | 272 | func CheckEvents(model Model, history []Event) bool { 273 | return CheckEventsTimeout(model, history, 0) 274 | } 275 | 276 | // timeout = 0 means no timeout 277 | // if this operation times out, then a false positive is possible 278 | func CheckEventsTimeout(model Model, history []Event, timeout time.Duration) bool { 279 | model = fillDefault(model) 280 | partitions := model.PartitionEvent(history) 281 | ok := true 282 | results := make(chan bool) 283 | kill := int32(0) 284 | for _, subhistory := range partitions { 285 | l := makeLinkedEntries(convertEntries(renumber(subhistory))) 286 | go func() { 287 | results <- checkSingle(model, l, &kill) 288 | }() 289 | } 290 | var timeoutChan <-chan time.Time 291 | if timeout > 0 { 292 | timeoutChan = time.After(timeout) 293 | } 294 | count := 0 295 | loop: 296 | for { 297 | select { 298 | case result := <-results: 299 | ok = ok && result 300 | if !ok { 301 | atomic.StoreInt32(&kill, 1) 302 | break loop 303 | } 304 | count++ 305 | if count >= len(partitions) { 306 | break loop 307 | } 308 | case <-timeoutChan: 309 | break loop // if we time out, we might get a false positive 310 | } 311 | } 312 | return ok 313 | } 314 | -------------------------------------------------------------------------------- /src/linearizability/model.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | type Operation struct { 4 | Input interface{} 5 | Call int64 // invocation time 6 | Output interface{} 7 | Return int64 // response time 8 | } 9 | 10 | type EventKind bool 11 | 12 | const ( 13 | CallEvent EventKind = false 14 | ReturnEvent EventKind = true 15 | ) 16 | 17 | type Event struct { 18 | Kind EventKind 19 | Value interface{} 20 | Id uint 21 | } 22 | 23 | type Model struct { 24 | // Partition functions, such that a history is linearizable if an only 25 | // if each partition is linearizable. If you don't want to implement 26 | // this, you can always use the `NoPartition` functions implemented 27 | // below. 28 | Partition func(history []Operation) [][]Operation 29 | PartitionEvent func(history []Event) [][]Event 30 | // Initial state of the system. 31 | Init func() interface{} 32 | // Step function for the system. 
Returns whether or not the system 33 | // could take this step with the given inputs and outputs and also 34 | // returns the new state. This should not mutate the existing state. 35 | Step func(state interface{}, input interface{}, output interface{}) (bool, interface{}) 36 | // Equality on states. If you are using a simple data type for states, 37 | // you can use the `ShallowEqual` function implemented below. 38 | Equal func(state1, state2 interface{}) bool 39 | } 40 | 41 | func NoPartition(history []Operation) [][]Operation { 42 | return [][]Operation{history} 43 | } 44 | 45 | func NoPartitionEvent(history []Event) [][]Event { 46 | return [][]Event{history} 47 | } 48 | 49 | func ShallowEqual(state1, state2 interface{}) bool { 50 | return state1 == state2 51 | } 52 | -------------------------------------------------------------------------------- /src/linearizability/models.go: -------------------------------------------------------------------------------- 1 | package linearizability 2 | 3 | // kv model 4 | 5 | type KvInput struct { 6 | Op uint8 // 0 => get, 1 => put, 2 => append 7 | Key string 8 | Value string 9 | } 10 | 11 | type KvOutput struct { 12 | Value string 13 | } 14 | 15 | func KvModel() Model { 16 | return Model{ 17 | Partition: func(history []Operation) [][]Operation { 18 | m := make(map[string][]Operation) 19 | for _, v := range history { 20 | key := v.Input.(KvInput).Key 21 | m[key] = append(m[key], v) 22 | } 23 | var ret [][]Operation 24 | for _, v := range m { 25 | ret = append(ret, v) 26 | } 27 | return ret 28 | }, 29 | Init: func() interface{} { 30 | // note: we are modeling a single key's value here; 31 | // we're partitioning by key, so this is okay 32 | return "" 33 | }, 34 | Step: func(state, input, output interface{}) (bool, interface{}) { 35 | inp := input.(KvInput) 36 | out := output.(KvOutput) 37 | st := state.(string) 38 | if inp.Op == 0 { 39 | // get 40 | return out.Value == st, state 41 | } else if inp.Op == 1 { 42 | // put 43 | return true, inp.Value 44 | } else { 45 | // append 46 | return true, (st + inp.Value) 47 | } 48 | }, 49 | Equal: ShallowEqual, 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/diskvd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a diskvd server. it's a member of some replica 5 | // group, which has other members, and it needs to know 6 | // how to talk to the members of the shardmaster service. 7 | // used by ../diskv/test_test.go 8 | // 9 | // arguments: 10 | // -g groupid 11 | // -m masterport1 -m masterport2 ... 12 | // -s replicaport1 -s replicaport2 ... 13 | // -i my-index-in-server-port-list 14 | // -u unreliable 15 | // -d directory 16 | // -r restart 17 | 18 | import "time" 19 | import "diskv" 20 | import "os" 21 | import "fmt" 22 | import "strconv" 23 | import "runtime" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: diskvd -g gid -m master... -s server... 
-i my-index -d dir\n") 27 | os.Exit(1) 28 | } 29 | 30 | func main() { 31 | var gid int64 = -1 // my replica group ID 32 | masters := []string{} // ports of shardmasters 33 | replicas := []string{} // ports of servers in my replica group 34 | me := -1 // my index in replicas[] 35 | unreliable := false 36 | dir := "" // store persistent data here 37 | restart := false 38 | 39 | for i := 1; i+1 < len(os.Args); i += 2 { 40 | a0 := os.Args[i] 41 | a1 := os.Args[i+1] 42 | if a0 == "-g" { 43 | gid, _ = strconv.ParseInt(a1, 10, 64) 44 | } else if a0 == "-m" { 45 | masters = append(masters, a1) 46 | } else if a0 == "-s" { 47 | replicas = append(replicas, a1) 48 | } else if a0 == "-i" { 49 | me, _ = strconv.Atoi(a1) 50 | } else if a0 == "-u" { 51 | unreliable, _ = strconv.ParseBool(a1) 52 | } else if a0 == "-d" { 53 | dir = a1 54 | } else if a0 == "-r" { 55 | restart, _ = strconv.ParseBool(a1) 56 | } else { 57 | usage() 58 | } 59 | } 60 | 61 | if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" { 62 | usage() 63 | } 64 | 65 | runtime.GOMAXPROCS(4) 66 | 67 | srv := diskv.StartServer(gid, masters, replicas, me, dir, restart) 68 | srv.Setunreliable(unreliable) 69 | 70 | // for safety, force quit after 10 minutes. 71 | time.Sleep(10 * 60 * time.Second) 72 | mep, _ := os.FindProcess(os.Getpid()) 73 | mep.Kill() 74 | } 75 | -------------------------------------------------------------------------------- /src/main/lockc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see comments in lockd.go 5 | // 6 | 7 | import "lockservice" 8 | import "os" 9 | import "fmt" 10 | 11 | func usage() { 12 | fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n") 13 | os.Exit(1) 14 | } 15 | 16 | func main() { 17 | if len(os.Args) == 5 { 18 | ck := lockservice.MakeClerk(os.Args[2], os.Args[3]) 19 | var ok bool 20 | if os.Args[1] == "-l" { 21 | ok = ck.Lock(os.Args[4]) 22 | } else if os.Args[1] == "-u" { 23 | ok = ck.Unlock(os.Args[4]) 24 | } else { 25 | usage() 26 | } 27 | fmt.Printf("reply: %v\n", ok) 28 | } else { 29 | usage() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/lockd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // export GOPATH=~/6.824 4 | // go build lockd.go 5 | // go build lockc.go 6 | // ./lockd -p a b & 7 | // ./lockd -b a b & 8 | // ./lockc -l a b lx 9 | // ./lockc -u a b lx 10 | // 11 | // on Athena, use /tmp/myname-a and /tmp/myname-b 12 | // instead of a and b. 
13 | 14 | import "time" 15 | import "lockservice" 16 | import "os" 17 | import "fmt" 18 | 19 | func main() { 20 | if len(os.Args) == 4 && os.Args[1] == "-p" { 21 | lockservice.StartServer(os.Args[2], os.Args[3], true) 22 | } else if len(os.Args) == 4 && os.Args[1] == "-b" { 23 | lockservice.StartServer(os.Args[2], os.Args[3], false) 24 | } else { 25 | fmt.Printf("Usage: lockd -p|-b primaryport backupport\n") 26 | os.Exit(1) 27 | } 28 | for { 29 | time.Sleep(100 * time.Second) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/mrmaster.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start the master process, which is implemented 5 | // in ../mr/master.go 6 | // 7 | // go run mrmaster.go pg*.txt 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "time" 12 | import "os" 13 | import "fmt" 14 | 15 | func main() { 16 | if len(os.Args) < 2 { 17 | fmt.Fprintf(os.Stderr, "Usage: mrmaster inputfiles...\n") 18 | os.Exit(1) 19 | } 20 | 21 | m := mr.MakeMaster(os.Args[1:], 10) 22 | for m.Done() == false { 23 | time.Sleep(time.Second) 24 | } 25 | 26 | time.Sleep(time.Second) 27 | } 28 | -------------------------------------------------------------------------------- /src/main/mrsequential.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // simple sequential MapReduce. 5 | // 6 | // go run mrsequential.go ../mrapps/wc.so pg*.txt 7 | // 8 | 9 | import "fmt" 10 | import "Mit6824/src/mr" 11 | import "plugin" 12 | import "os" 13 | import "log" 14 | import "io/ioutil" 15 | import "sort" 16 | 17 | // for sorting by key. 18 | type ByKey []mr.KeyValue 19 | 20 | // for sorting by key. 21 | func (a ByKey) Len() int { return len(a) } 22 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 23 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 24 | 25 | func main() { 26 | if len(os.Args) < 3 { 27 | fmt.Fprintf(os.Stderr, "Usage: mrsequential ../mrapps/xxx.so inputfiles...\n") 28 | os.Exit(1) 29 | } 30 | 31 | mapf, reducef := loadPlugin(os.Args[1]) 32 | 33 | // 34 | // read each input file, 35 | // pass it to Map, 36 | // accumulate the intermediate Map output. 37 | // 38 | intermediate := []mr.KeyValue{} 39 | for _, filename := range os.Args[2:] { 40 | file, err := os.Open(filename) 41 | if err != nil { 42 | log.Fatalf("cannot open %v", filename) 43 | } 44 | content, err := ioutil.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("cannot read %v", filename) 47 | } 48 | file.Close() 49 | kva := mapf(filename, string(content)) 50 | intermediate = append(intermediate, kva...) 51 | } 52 | 53 | // 54 | // a big difference from real MapReduce is that all the 55 | // intermediate data is in one place, intermediate[], 56 | // rather than being partitioned into NxM buckets. 57 | // 58 | 59 | sort.Sort(ByKey(intermediate)) 60 | 61 | oname := "mr-out-0" 62 | ofile, _ := os.Create(oname) 63 | 64 | // 65 | // call Reduce on each distinct key in intermediate[], 66 | // and print the result to mr-out-0. 67 | // 68 | i := 0 69 | for i < len(intermediate) { 70 | j := i + 1 71 | for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key { 72 | j++ 73 | } 74 | values := []string{} 75 | for k := i; k < j; k++ { 76 | values = append(values, intermediate[k].Value) 77 | } 78 | output := reducef(intermediate[i].Key, values) 79 | 80 | // this is the correct format for each line of Reduce output. 
81 | fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output) 82 | 83 | i = j 84 | } 85 | 86 | ofile.Close() 87 | } 88 | 89 | // 90 | // load the application Map and Reduce functions 91 | // from a plugin file, e.g. ../mrapps/wc.so 92 | // 93 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 94 | p, err := plugin.Open(filename) 95 | if err != nil { 96 | log.Fatalf("cannot load plugin %v", filename) 97 | } 98 | xmapf, err := p.Lookup("Map") 99 | if err != nil { 100 | log.Fatalf("cannot find Map in %v", filename) 101 | } 102 | mapf := xmapf.(func(string, string) []mr.KeyValue) 103 | xreducef, err := p.Lookup("Reduce") 104 | if err != nil { 105 | log.Fatalf("cannot find Reduce in %v", filename) 106 | } 107 | reducef := xreducef.(func(string, []string) string) 108 | 109 | return mapf, reducef 110 | } 111 | -------------------------------------------------------------------------------- /src/main/mrworker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a worker process, which is implemented 5 | // in ../mr/worker.go. typically there will be 6 | // multiple worker processes, talking to one master. 7 | // 8 | // go run mrworker.go ../mrapps/wc.so 9 | // 10 | 11 | import "Mit6824/src/mr" 12 | import "plugin" 13 | import "os" 14 | import "fmt" 15 | import "log" 16 | 17 | func main() { 18 | if len(os.Args) != 2 { 19 | fmt.Fprintf(os.Stderr, "Usage: mrworker ../mrapps/xxx.so\n") 20 | os.Exit(1) 21 | } 22 | 23 | mapf, reducef := loadPlugin(os.Args[1]) 24 | 25 | mr.Worker(mapf, reducef) 26 | } 27 | 28 | // 29 | // load the application Map and Reduce functions 30 | // from a plugin file, e.g. ../mrapps/wc.so 31 | // 32 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 33 | p, err := plugin.Open(filename) 34 | if err != nil { 35 | log.Fatalf("cannot load plugin %v", filename) 36 | } 37 | xmapf, err := p.Lookup("Map") 38 | if err != nil { 39 | log.Fatalf("cannot find Map in %v", filename) 40 | } 41 | mapf := xmapf.(func(string, string) []mr.KeyValue) 42 | xreducef, err := p.Lookup("Reduce") 43 | if err != nil { 44 | log.Fatalf("cannot find Reduce in %v", filename) 45 | } 46 | reducef := xreducef.(func(string, []string) string) 47 | 48 | return mapf, reducef 49 | } 50 | -------------------------------------------------------------------------------- /src/main/pbc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // pbservice client application 5 | // 6 | // export GOPATH=~/6.824 7 | // go build viewd.go 8 | // go build pbd.go 9 | // go build pbc.go 10 | // ./viewd /tmp/rtm-v & 11 | // ./pbd /tmp/rtm-v /tmp/rtm-1 & 12 | // ./pbd /tmp/rtm-v /tmp/rtm-2 & 13 | // ./pbc /tmp/rtm-v key1 value1 14 | // ./pbc /tmp/rtm-v key1 15 | // 16 | // change "rtm" to your user name. 17 | // start the pbd programs in separate windows and kill 18 | // and restart them to exercise fault tolerance. 
19 | // 20 | 21 | import "pbservice" 22 | import "os" 23 | import "fmt" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: pbc viewport key\n") 27 | fmt.Printf(" pbc viewport key value\n") 28 | os.Exit(1) 29 | } 30 | 31 | func main() { 32 | if len(os.Args) == 3 { 33 | // get 34 | ck := pbservice.MakeClerk(os.Args[1], "") 35 | v := ck.Get(os.Args[2]) 36 | fmt.Printf("%v\n", v) 37 | } else if len(os.Args) == 4 { 38 | // put 39 | ck := pbservice.MakeClerk(os.Args[1], "") 40 | ck.Put(os.Args[2], os.Args[3]) 41 | } else { 42 | usage() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/pbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "pbservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 3 { 14 | fmt.Printf("Usage: pbd viewport myport\n") 15 | os.Exit(1) 16 | } 17 | 18 | pbservice.StartServer(os.Args[1], os.Args[2]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/test-mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # basic map-reduce test 5 | # 6 | 7 | RACE= 8 | 9 | # uncomment this to run the tests with the Go race detector. 10 | #RACE=-race 11 | 12 | # run the test in a fresh sub-directory. 13 | rm -rf mr-tmp 14 | mkdir mr-tmp || exit 1 15 | cd mr-tmp || exit 1 16 | rm -f mr-* 17 | 18 | # make sure software is freshly built. 19 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 20 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 21 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 22 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 23 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 24 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 25 | (cd .. && go build $RACE mrmaster.go) || exit 1 26 | (cd .. && go build $RACE mrworker.go) || exit 1 27 | (cd .. && go build $RACE mrsequential.go) || exit 1 28 | 29 | # first word-count 30 | 31 | # generate the correct output 32 | ../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1 33 | sort mr-out-0 > mr-correct-wc.txt 34 | rm -f mr-out* 35 | 36 | echo '***' Starting wc test. 37 | 38 | ../mrmaster ../pg*txt & 39 | sleep 1 40 | 41 | # start multiple workers 42 | ../mrworker ../../mrapps/wc.so & 43 | ../mrworker ../../mrapps/wc.so & 44 | ../mrworker ../../mrapps/wc.so 45 | 46 | sort mr-out* > mr-wc-all 47 | if cmp mr-wc-all mr-correct-wc.txt 48 | then 49 | echo '---' wc test: PASS 50 | else 51 | echo '---' wc output is not the same as mr-correct-wc.txt 52 | echo '---' wc test: FAIL 53 | exit 1 54 | fi 55 | 56 | # now indexer 57 | rm -f mr-* 58 | 59 | # generate the correct output 60 | ../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1 61 | sort mr-out-0 > mr-correct-indexer.txt 62 | rm -f mr-out* 63 | 64 | echo '***' Starting indexer test. 
65 | 66 | ../mrmaster ../pg*txt & 67 | sleep 1 68 | 69 | # start multiple workers 70 | ../mrworker ../../mrapps/indexer.so & 71 | ../mrworker ../../mrapps/indexer.so 72 | 73 | sort mr-out* > mr-indexer-all 74 | if cmp mr-indexer-all mr-correct-indexer.txt 75 | then 76 | echo '---' indexer test: PASS 77 | else 78 | echo '---' indexer output is not the same as mr-correct-indexer.txt 79 | echo '---' indexer test: FAIL 80 | exit 1 81 | fi 82 | 83 | 84 | 85 | 86 | echo '***' Starting map parallelism test. 87 | 88 | rm -f mr-out* mr-worker* 89 | 90 | ../mrmaster ../pg*txt & 91 | sleep 1 92 | 93 | ../mrworker ../../mrapps/mtiming.so & 94 | ../mrworker ../../mrapps/mtiming.so 95 | 96 | NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'` 97 | if [ "$NT" != "2" ] 98 | then 99 | echo '---' saw "$NT" workers rather than 2 100 | echo '---' map parallelism test: FAIL 101 | exit 1 102 | fi 103 | 104 | if cat mr-out* | grep '^parallel.* 2' > /dev/null 105 | then 106 | echo '---' map parallelism test: PASS 107 | else 108 | echo '---' map workers did not run in parallel 109 | echo '---' map parallelism test: FAIL 110 | exit 1 111 | fi 112 | 113 | 114 | echo '***' Starting reduce parallelism test. 115 | 116 | rm -f mr-out* mr-worker* 117 | 118 | ../mrmaster ../pg*txt & 119 | sleep 1 120 | 121 | ../mrworker ../../mrapps/rtiming.so & 122 | ../mrworker ../../mrapps/rtiming.so 123 | 124 | NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'` 125 | if [ "$NT" -lt "2" ] 126 | then 127 | echo '---' too few parallel reduces. 128 | echo '---' reduce parallelism test: FAIL 129 | exit 1 130 | else 131 | echo '---' reduce parallelism test: PASS 132 | fi 133 | 134 | 135 | 136 | # generate the correct output 137 | ../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1 138 | sort mr-out-0 > mr-correct-crash.txt 139 | rm -f mr-out* 140 | 141 | echo '***' Starting crash test. 142 | 143 | rm -f mr-done 144 | (../mrmaster ../pg*txt ; touch mr-done ) & 145 | sleep 1 146 | 147 | # start multiple workers 148 | ../mrworker ../../mrapps/crash.so & 149 | 150 | ( while [ -e mr-socket -a ! -f mr-done ] 151 | do 152 | ../mrworker ../../mrapps/crash.so 153 | sleep 1 154 | done ) & 155 | 156 | ( while [ -e mr-socket -a ! -f mr-done ] 157 | do 158 | ../mrworker ../../mrapps/crash.so 159 | sleep 1 160 | done ) & 161 | 162 | while [ -e mr-socket -a ! 
-f mr-done ] 163 | do 164 | ../mrworker ../../mrapps/crash.so 165 | sleep 1 166 | done 167 | 168 | wait 169 | wait 170 | wait 171 | 172 | rm mr-socket 173 | sort mr-out* > mr-crash-all 174 | if cmp mr-crash-all mr-correct-crash.txt 175 | then 176 | echo '---' crash test: PASS 177 | else 178 | echo '---' crash output is not the same as mr-correct-crash.txt 179 | echo '---' crash test: FAIL 180 | exit 1 181 | fi 182 | 183 | echo '***' PASSED ALL TESTS 184 | -------------------------------------------------------------------------------- /src/main/viewd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "viewservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 2 { 14 | fmt.Printf("Usage: viewd port\n") 15 | os.Exit(1) 16 | } 17 | 18 | viewservice.StartServer(os.Args[1]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/mr/master.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | ) 11 | import "net" 12 | import "net/rpc" 13 | import "net/http" 14 | 15 | var dispatcher *Dispatcher 16 | 17 | // 主节点 18 | type Master struct { 19 | // Your definitions here. 20 | S *JobState 21 | TP *TaskPool 22 | W *sync.Map 23 | } 24 | 25 | // Job 状态 26 | type JobState struct { 27 | MatrixSource [][]string // MC * RC 28 | MC int 29 | RC int 30 | MCDone int32 31 | nextWorkerID uint64 32 | allDone int // 用于标示全部完成的状态 0代表没有全部完成 1 代表已经全部完成 进入优雅关闭状态 33 | } 34 | 35 | // 任务池 36 | type TaskPool struct { 37 | Pool chan *Task 38 | } 39 | 40 | // 任务 41 | type Task struct { 42 | Status int // 0 未完成 1工作中 2已完成 43 | Type int // 0 map 任务 1 reduce 任务 2 shut down 3 retry 44 | Conf *TaskConf 45 | } 46 | 47 | // 任务配置 48 | type TaskConf struct { 49 | Source []string // 兼容两种任务 50 | RNum int // 当前 map 任务的 任务编号 如果是reduce任务 则为-1 51 | MNum int // 当前 reduce 任务的 任务编号 如果是map任务 则为-1 52 | RC int // reduce 的任务数 53 | } 54 | 55 | // 定时清理器 56 | type Dispatcher struct { 57 | TimeOut time.Duration //默认10秒 58 | M *Master //主节点全局结构 59 | ReduceSourceChan chan *ReduceSource // 发送 reduce 的任务 执行内容 60 | CleanWorkerChan chan uint64 // 清理失效的worker 61 | } 62 | 63 | // 工作者会话管理器 64 | type WorkerSession struct { 65 | WorkerID uint64 66 | Status int // 0 空闲状态 1 工作状态 2 无法正常工作 67 | T *Task 68 | Mux *sync.RWMutex 69 | LastPingTs int64 70 | PingPongChan chan struct{} 71 | } 72 | 73 | type ReduceSource struct { 74 | MIdx int 75 | MapSource []string // map 任务返回的 source 列表 76 | } 77 | 78 | // Your code here -- RPC handlers for the worker to call. 79 | 80 | // 81 | // an example RPC handler. 
82 | // 83 | func (m *Master) Example(args *ExampleArgs, reply *ExampleReply) error { 84 | reply.Y = args.X + 1 85 | return nil 86 | } 87 | 88 | func (m *Master) RegisterWorker(args *RegisterReq, reply *RegisterRes) error { 89 | _ = args 90 | for { 91 | assignID := atomic.LoadUint64(&m.S.nextWorkerID) 92 | if atomic.CompareAndSwapUint64(&m.S.nextWorkerID, assignID, assignID+1) { 93 | reply.WorkerID = assignID 94 | ws := &WorkerSession{ 95 | WorkerID: assignID, 96 | Status: 0, // 0 代表 健康良好 1 代表失联 97 | T: nil, // 正在执行的任务 98 | LastPingTs: time.Now().UnixNano() / 1e6, 99 | Mux: &sync.RWMutex{}, 100 | PingPongChan: make(chan struct{}), 101 | } 102 | m.W.Store(assignID, ws) 103 | go ws.PingPong(dispatcher.TimeOut) 104 | return nil 105 | } 106 | // TODO:不应该无限重试 应该设置一个限制 107 | time.Sleep(10 * time.Millisecond) 108 | } 109 | } 110 | 111 | func (m *Master) GetTaskWorker(args *GetTaskReq, reply *GetTaskRes) error { 112 | // 延迟 5秒后 若五任务就返回 113 | c := time.After(5 * time.Second) 114 | if worker, ok := m.W.Load(args.WorkerID); ok { 115 | w := worker.(*WorkerSession) 116 | select { 117 | case task, ok := <-m.TP.Pool: 118 | if !ok { 119 | shutdown(reply) 120 | m.W.Delete(w.WorkerID) 121 | return nil 122 | } 123 | task.Status = 1 124 | reply.T = task 125 | w.Mux.Lock() 126 | defer w.Mux.Unlock() 127 | w.Status = 1 // 任务负载 128 | w.T = task 129 | case <-c: 130 | // 返回nil 131 | } 132 | } else { 133 | shutdown(reply) 134 | m.W.Delete(args.WorkerID) 135 | } 136 | return nil 137 | } 138 | 139 | func (m *Master) ReportResult(args *ResultReq, reply *ResultRes) error { 140 | //fmt.Println("ReportResult", args.WorkerID, args.M, args.Code) 141 | if len(args.M) == 0 { 142 | reply.Code = 1 143 | reply.Msg = "The report cannot be empty" 144 | return nil 145 | } 146 | if ws, ok := m.W.Load(args.WorkerID); ok { 147 | w := ws.(*WorkerSession) 148 | switch args.Code { 149 | case 0: // map 150 | if w.T == nil { 151 | reply.Msg = "shut down!!!" 152 | reply.Code = 1 153 | return nil 154 | } 155 | dispatcher.ReduceSourceChan <- &ReduceSource{ 156 | MIdx: w.T.Conf.MNum, 157 | MapSource: args.M, 158 | } 159 | case 1: // reduce 160 | if w.T == nil { 161 | reply.Msg = "shut down!!!" 162 | reply.Code = 1 163 | return nil 164 | } 165 | m.S.MatrixSource[m.S.MC][w.T.Conf.RNum] = "done" 166 | case 2: // failed 167 | task := w.T 168 | m.W.Delete(args.WorkerID) 169 | task.Status = 0 // 重新置为 未分配状态 170 | m.TP.Pool <- task // 将任务重新加入队列 171 | reply.Code = 0 172 | return nil 173 | default: 174 | reply.Code = 1 175 | reply.Msg = fmt.Sprintf("Code %d do not recognize", args.Code) 176 | return nil 177 | } 178 | w.Mux.Lock() 179 | defer w.Mux.Unlock() 180 | w.Status = 0 // 节点空闲 181 | w.T = nil // 任务完成 182 | w.LastPingTs = time.Now().UnixNano() / 1e6 // 更新会话时间戳 183 | reply.Code = 0 184 | return nil 185 | } 186 | reply.Code = 1 187 | reply.Msg = "unregistered" 188 | return nil 189 | } 190 | 191 | func (m *Master) PingPong(args *Ping, reply *Pong) error { 192 | if ws, ok := m.W.Load(args.WorkerID); ok { 193 | w := ws.(*WorkerSession) 194 | w.Mux.Lock() 195 | defer w.Mux.Unlock() 196 | w.LastPingTs = time.Now().UnixNano() / 1e6 // 更新会话时间戳 197 | w.PingPongChan <- struct{}{} 198 | } 199 | reply.Code = 0 200 | return nil 201 | } 202 | 203 | func shutdown(reply *GetTaskRes) { 204 | reply.Msg = "shut down!!!" 
205 | reply.T = &Task{ 206 | Status: 0, 207 | Type: 2, 208 | Conf: &TaskConf{Source: []string{}}, 209 | } 210 | } 211 | 212 | func (d *Dispatcher) cleanSession() { 213 | for workerID := range d.CleanWorkerChan { 214 | if w, ok := d.M.W.Load(workerID); ok { 215 | worker := w.(*WorkerSession) 216 | worker.Mux.Lock() 217 | task := worker.T 218 | worker.T = nil 219 | worker.Mux.Unlock() 220 | if task != nil { 221 | task.Status = 0 222 | //fmt.Println("cleanSession.task",workerID,task.Status,task.Conf.Source) 223 | d.M.TP.Pool <- task 224 | } 225 | d.M.W.Delete(worker) 226 | //fmt.Println("cleanSession.worker",workerID) 227 | } 228 | } 229 | } 230 | 231 | func (d *Dispatcher) updateJobState() { 232 | for rs := range d.ReduceSourceChan { 233 | d.M.S.MatrixSource[rs.MIdx] = rs.MapSource 234 | atomic.AddInt32(&d.M.S.MCDone, 1) 235 | if atomic.LoadInt32(&d.M.S.MCDone) == int32(d.M.S.MC) { 236 | //fmt.Println(d.M.S.MCDone) 237 | for j := 0; j < d.M.S.RC; j++ { 238 | sources := make([]string, 0) 239 | for i := 0; i < d.M.S.MC; i++ { 240 | sources = append(sources, d.M.S.MatrixSource[i][j]) 241 | } 242 | d.M.TP.Pool <- &Task{ 243 | Status: 0, 244 | Type: 1, // Reduce 任务 245 | Conf: &TaskConf{ 246 | Source: sources, 247 | RNum: j, 248 | MNum: -1, 249 | RC: d.M.S.RC, 250 | }, 251 | } 252 | d.M.S.MatrixSource[d.M.S.MC][j] = "created" 253 | //fmt.Println(sources, d.M.S.MatrixSource[d.M.S.MC][j]) 254 | } 255 | } 256 | } 257 | } 258 | 259 | func (d *Dispatcher) run() { 260 | go d.cleanSession() 261 | go d.updateJobState() 262 | } 263 | 264 | func (w *WorkerSession) PingPong(ts time.Duration) { 265 | for { 266 | tc := time.NewTicker(ts) 267 | select { 268 | case _ = <-tc.C: 269 | dispatcher.CleanWorkerChan <- w.WorkerID 270 | case _ = <-w.PingPongChan: 271 | tc.Stop() 272 | // TODO: 这里应该 有一个 close 信号将协程退出 否则程序中会存在大量无用的协程 存在泄露的风险 273 | } 274 | } 275 | } 276 | 277 | // 278 | // start a thread that listens for RPCs from worker.go 279 | // 280 | func (m *Master) server() { 281 | if err := rpc.Register(m); err != nil { 282 | panic(err) 283 | } 284 | rpc.HandleHTTP() 285 | //l, e := net.Listen("tcp", ":1234") 286 | _ = os.Remove("mr-socket") 287 | l, e := net.Listen("unix", "mr-socket") 288 | if e != nil { 289 | log.Fatal("listen error:", e) 290 | } 291 | go func() { 292 | if err := http.Serve(l, nil); err != nil { 293 | panic(err) 294 | } 295 | }() 296 | } 297 | 298 | // 299 | // main/mrmaster.go calls Done() periodically to find out 300 | // if the entire job has finished. 301 | // 302 | func (m *Master) Done() bool { 303 | ret := false 304 | // Your code here. 305 | count := 0 306 | for _, v := range m.S.MatrixSource[m.S.MC] { 307 | if v == "done" { 308 | count++ 309 | } 310 | } 311 | if count == m.S.RC { 312 | //fmt.Println(m.S.allDone) 313 | //close(dispatcher.CleanWorkerChan) 314 | //close(dispatcher.ReduceSourceChan) 315 | if len(m.TP.Pool) != 0 { 316 | return false 317 | } 318 | if m.S.allDone == 0 { 319 | close(m.TP.Pool) // 将会通知所有 worker 进行下线 320 | m.S.allDone = 1 321 | } 322 | c := 0 323 | m.W.Range(func(key, value interface{}) bool { 324 | w := value.(*WorkerSession) 325 | if w.T != nil { 326 | c++ 327 | } 328 | return true 329 | }) 330 | if c == 0 { 331 | ret = true 332 | // TODO: 一个完美主义者 不想让命令行打印出来一些无关紧要的东西(又不想改框架本身) 333 | _ = os.Remove("mr-socket") 334 | _, _ = os.Create("mr-socket") 335 | } 336 | } 337 | return ret 338 | } 339 | 340 | // 341 | // create a Master. 342 | // 343 | func MakeMaster(files []string, nReduce int) *Master { 344 | m := Master{} 345 | // Your code here. 
346 | sources := make([][]string, len(files)+1) // 多出一行保存完成状态 347 | for i := 0; i < len(sources); i++ { 348 | sources[i] = make([]string, nReduce) 349 | } 350 | m.S = &JobState{ 351 | MatrixSource: sources, 352 | MC: len(files), 353 | RC: nReduce, 354 | nextWorkerID: uint64(0), 355 | } 356 | m.TP = &TaskPool{Pool: make(chan *Task, len(files))} 357 | m.W = &sync.Map{} 358 | 359 | dispatcher = &Dispatcher{ 360 | TimeOut: 10 * time.Second, 361 | M: &m, 362 | ReduceSourceChan: make(chan *ReduceSource, nReduce), 363 | CleanWorkerChan: make(chan uint64, len(files)), 364 | } 365 | dispatcher.run() 366 | // 初始化map任务 367 | for num, file := range files { 368 | m.TP.Pool <- &Task{ 369 | Status: 0, // 0 未完成 1工作中 2已完成 370 | Type: 0, // 0 map 任务 1 reduce 任务 2 shut down 3 retry 371 | Conf: &TaskConf{Source: []string{file}, MNum: num, RNum: -1, RC: nReduce}, 372 | } 373 | } 374 | m.server() 375 | return &m 376 | } 377 | -------------------------------------------------------------------------------- /src/mr/rpc.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | 7 | // 8 | // example to show how to declare the arguments 9 | // and reply for an RPC. 10 | // 11 | 12 | type ExampleArgs struct { 13 | X int 14 | } 15 | 16 | type ExampleReply struct { 17 | Y int 18 | } 19 | 20 | // Add your RPC definitions here. 21 | // 注册 22 | type RegisterReq struct { 23 | } 24 | 25 | type RegisterRes struct { 26 | WorkerID uint64 27 | } 28 | 29 | // 获取任务 30 | type GetTaskReq struct { 31 | WorkerID uint64 32 | } 33 | type GetTaskRes struct { 34 | Code int 35 | Msg string 36 | T *Task 37 | } 38 | 39 | // 返回结果 40 | type ResultReq struct { 41 | WorkerID uint64 42 | Code int // 0 代表 map 1 代表 reduce 2代表 失败 43 | Msg string 44 | M []string 45 | } 46 | 47 | type ResultRes struct { 48 | Code int 49 | Msg string 50 | } 51 | 52 | // 健康检查 53 | type Ping struct { 54 | WorkerID uint64 55 | } 56 | 57 | type Pong struct { 58 | Code int 59 | } 60 | -------------------------------------------------------------------------------- /src/mr/worker.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "os" 9 | "sort" 10 | "strings" 11 | "time" 12 | ) 13 | import "log" 14 | import "net/rpc" 15 | import "hash/fnv" 16 | 17 | // 18 | // Map functions return a slice of KeyValue. 19 | // 20 | type KeyValue struct { 21 | Key string 22 | Value string 23 | } 24 | 25 | // for sorting by key. 26 | type ByKey []KeyValue 27 | 28 | // for sorting by key. 29 | func (a ByKey) Len() int { return len(a) } 30 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 31 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 32 | 33 | // 34 | // use ihash(key) % NReduce to choose the reduce 35 | // task number for each KeyValue emitted by Map. 36 | // 37 | func ihash(key string) int { 38 | h := fnv.New32a() 39 | h.Write([]byte(key)) 40 | return int(h.Sum32() & 0x7fffffff) 41 | } 42 | 43 | var workerID uint64 44 | 45 | func Worker(mapf func(string, string) []KeyValue, 46 | reducef func(string, []string) string) { 47 | // Your worker implementation here. 
48 | workerID = Register() 49 | go func() { 50 | tc := time.NewTicker(10 * time.Second) 51 | defer tc.Stop() 52 | for { 53 | <-tc.C 54 | PingPong() 55 | } 56 | }() 57 | var task *Task 58 | for { 59 | // 1.获取任务 60 | task = GetTask() 61 | // 2.根据任务类型执行任务 62 | res, err := ExecTask(mapf, reducef, task) 63 | if err != nil { 64 | continue 65 | } 66 | // 3.报告结果 67 | Report(res) 68 | } 69 | } 70 | 71 | // 72 | // example function to show how to make an RPC call to the master. 73 | // 74 | 75 | const ( 76 | CallRegister = "Master.RegisterWorker" 77 | CallPingPong = "Master.PingPong" 78 | CallGetTask = "Master.GetTaskWorker" 79 | CallReport = "Master.ReportResult" 80 | ) 81 | 82 | func CallExample() { 83 | 84 | // declare an argument structure. 85 | args := ExampleArgs{} 86 | 87 | // fill in the argument(s). 88 | args.X = 99 89 | 90 | // declare a reply structure. 91 | reply := ExampleReply{} 92 | 93 | // send the RPC request, wait for the reply. 94 | call("Master.Example", &args, &reply) 95 | 96 | // reply.Y should be 100. 97 | fmt.Printf("reply.Y %v\n", reply.Y) 98 | } 99 | 100 | // TODO: 是否可以包装成对象 101 | func Register() uint64 { 102 | args, reply := RegisterReq{}, RegisterRes{} 103 | // TODO: 处理请求异常? 104 | call(CallRegister, &args, &reply) 105 | return reply.WorkerID 106 | } 107 | func PingPong() { 108 | args, reply := Ping{WorkerID: workerID}, Pong{} 109 | call(CallPingPong, &args, &reply) 110 | //TODO:解析PingPong异常? 111 | } 112 | func GetTask() *Task { 113 | args, reply := GetTaskReq{WorkerID: workerID}, GetTaskRes{} 114 | call(CallGetTask, &args, &reply) 115 | //TODO:解析异常? 116 | return reply.T 117 | } 118 | func ExecTask(mapf func(string, string) []KeyValue, 119 | reducef func(string, []string) string, task *Task) (*ResultReq, error) { 120 | var res []string 121 | req := &ResultReq{WorkerID: workerID} 122 | if task == nil { 123 | return nil, fmt.Errorf("retry") // TODO:应该定义一种错误类型 124 | } 125 | if task.Type == 0 { 126 | // TODO: 执行 map 任务 127 | res, _ = doMap(mapf, task) 128 | req.Code = 0 129 | } else if task.Type == 1 { 130 | // TODO: 执行 reduce 任务 131 | res, _ = doReduce(reducef, task) 132 | req.Code = 1 133 | } else if task.Type == 2 { 134 | os.Exit(0) 135 | } 136 | if len(res) == 0 { 137 | return nil, fmt.Errorf("retry") // TODO:应该定义一种错误类型 138 | } 139 | req.M = res 140 | return req, nil 141 | } 142 | 143 | func Report(res *ResultReq) { 144 | reply := ResultRes{} 145 | call(CallReport, res, &reply) 146 | //TODO:处理返回异常? 
147 | } 148 | 149 | func doMap(mapf func(string, string) []KeyValue, task *Task) ([]string, error) { 150 | // TODO:对task进行检查 151 | res := make([]string, 0) 152 | fileName := task.Conf.Source[0] 153 | file, err := os.Open(fileName) 154 | defer func() { 155 | _ = file.Close() 156 | }() 157 | if err != nil { 158 | return nil, fmt.Errorf("doMap.Open.err:%s", err.Error()) 159 | } 160 | content, err := ioutil.ReadAll(file) 161 | if err != nil { 162 | return nil, fmt.Errorf(fmt.Sprintf("doMap.ReadAll.err:%s", err.Error())) 163 | } 164 | cacheMap := make(map[string][]KeyValue, 0) 165 | for i := 0; i < task.Conf.RC; i++ { 166 | key := fmt.Sprintf("mr-worker-%d-%d.out", task.Conf.MNum, i) 167 | cacheMap[key] = []KeyValue{} 168 | res = append(res, key) 169 | } 170 | kva := mapf(fileName, string(content)) 171 | for i := 0; i < len(kva); i++ { 172 | idx := ihash(kva[i].Key) % task.Conf.RC // TODO: ihash(kva[i].Key) & (task.Conf.RC - 1) 173 | key := fmt.Sprintf("mr-worker-%d-%d.out", task.Conf.MNum, idx) 174 | cacheMap[key] = append(cacheMap[key], kva[i]) 175 | } 176 | 177 | for key, value := range cacheMap { 178 | sort.Sort(ByKey(value)) 179 | // TODO: 在这里可以调用一次 reduce函数进行合并 以减少网络调用 180 | combine(value) 181 | // TODO: 这里是否也存在的 map函数生成文件的幂等问题 182 | outFile, _ := os.Create(key) 183 | for i := 0; i < len(value); i++ { 184 | _, _ = fmt.Fprintf(outFile, "%v %v\n", value[i].Key, value[i].Value) 185 | } 186 | _ = outFile.Close() 187 | } 188 | return res, nil 189 | } 190 | 191 | func doReduce(reducef func(string, []string) string, task *Task) ([]string, error) { 192 | // TODO: 检查task 193 | // TODO: 外部排序? -> 先实现一个内存排序吧 194 | kvas := readFiles(task) 195 | tmpFileName := fmt.Sprintf("mr-out-%d.%d.swap", time.Now().Unix(), task.Conf.RNum) 196 | outFile, _ := os.OpenFile(tmpFileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0664) 197 | defer outFile.Close() 198 | // TODO:先在内存排个顺序 199 | sort.Sort(ByKey(kvas)) 200 | // TODO: 先这样处理 201 | if len(kvas) == 0 { 202 | key := fmt.Sprintf("mr-out-%d", task.Conf.RNum) 203 | _ = os.Rename(tmpFileName, key) 204 | return []string{key}, nil 205 | } 206 | buf := []KeyValue{kvas[0]} 207 | // TODO:处理边界 208 | // [a,a,a,a,a,b,b,b,b,c,c,c] 209 | for i := 1; i < len(kvas); i++ { 210 | if buf[len(buf)-1].Key == kvas[i].Key { // buf 中最后的一个key 与当前key 相同 211 | buf = append(buf, kvas[i]) 212 | } else { 213 | out := reducef(buf[len(buf)-1].Key, toValues(buf)) 214 | _, _ = fmt.Fprintf(outFile, "%v %v\n", buf[len(buf)-1].Key, out) 215 | buf = []KeyValue{kvas[i]} 216 | } 217 | } 218 | // 写入最后的buf进去 219 | out := reducef(buf[len(buf)-1].Key, toValues(buf)) 220 | _, _ = fmt.Fprintf(outFile, "%v %v\n", buf[len(buf)-1].Key, out) 221 | //TODO:处理路径问题? 
222 | key := fmt.Sprintf("mr-out-%d", task.Conf.RNum) 223 | _ = os.Rename(tmpFileName, key) 224 | return []string{key}, nil 225 | } 226 | 227 | func toValues(kvas []KeyValue) []string { 228 | res := make([]string, 0) 229 | for _, kv := range kvas { 230 | res = append(res, kv.Value) 231 | } 232 | return res 233 | } 234 | func readFiles(task *Task) []KeyValue { 235 | // TODO: 生产级别应该实现 外部排序并返回一个迭代器 236 | // TODO: 进行错误处理 237 | res := make([]KeyValue, 0) 238 | for _, v := range task.Conf.Source { 239 | file, _ := os.Open(v) 240 | br := bufio.NewReader(file) 241 | for { 242 | line, _, c := br.ReadLine() 243 | if c == io.EOF { 244 | break 245 | } 246 | data := strings.Split(string(line), " ") 247 | res = append(res, KeyValue{ 248 | Key: data[0], 249 | Value: data[1], 250 | }) 251 | } 252 | _ = file.Close() 253 | } 254 | return res 255 | } 256 | func combine(intermediate []KeyValue) { 257 | 258 | } 259 | func sendTaskFail(task *Task) { 260 | } 261 | 262 | // 263 | // send an RPC request to the master, wait for the response. 264 | // usually returns true. 265 | // returns false if something goes wrong. 266 | // 267 | func call(rpcname string, args interface{}, reply interface{}) bool { 268 | //c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 269 | c, err := rpc.DialHTTP("unix", "mr-socket") 270 | if err != nil { 271 | log.Fatal("dialing:", err) 272 | } 273 | defer c.Close() 274 | 275 | err = c.Call(rpcname, args, reply) 276 | if err == nil { 277 | return true 278 | } 279 | 280 | fmt.Println(err) 281 | return false 282 | } 283 | -------------------------------------------------------------------------------- /src/mrapps/crash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that sometimes crashes, 5 | // and sometimes takes a long time, 6 | // to test MapReduce's ability to recover. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "Mit6824/src/mr" 12 | import crand "crypto/rand" 13 | import "math/big" 14 | import "strings" 15 | import "os" 16 | import "sort" 17 | import "strconv" 18 | import "time" 19 | 20 | func maybeCrash() { 21 | max := big.NewInt(1000) 22 | rr, _ := crand.Int(crand.Reader, max) 23 | if rr.Int64() < 330 { 24 | // crash! 25 | os.Exit(1) 26 | } else if rr.Int64() < 660 { 27 | // delay for a while. 28 | maxms := big.NewInt(10 * 1000) 29 | ms, _ := crand.Int(crand.Reader, maxms) 30 | time.Sleep(time.Duration(ms.Int64()) * time.Millisecond) 31 | } 32 | } 33 | 34 | func Map(filename string, contents string) []mr.KeyValue { 35 | maybeCrash() 36 | 37 | kva := []mr.KeyValue{} 38 | kva = append(kva, mr.KeyValue{"a", filename}) 39 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 40 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 41 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 42 | return kva 43 | } 44 | 45 | func Reduce(key string, values []string) string { 46 | maybeCrash() 47 | 48 | // sort values to ensure deterministic output. 49 | vv := make([]string, len(values)) 50 | copy(vv, values) 51 | sort.Strings(vv) 52 | 53 | val := strings.Join(vv, " ") 54 | return val 55 | } 56 | -------------------------------------------------------------------------------- /src/mrapps/indexer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // an indexing application "plugin" for MapReduce. 
5 | // 6 | // go build -buildmode=plugin indexer.go 7 | // 8 | 9 | import "fmt" 10 | import "Mit6824/src/mr" 11 | 12 | import "strings" 13 | import "unicode" 14 | import "sort" 15 | 16 | // The mapping function is called once for each piece of the input. 17 | // In this framework, the key is the name of the file that is being processed, 18 | // and the value is the file's contents. The return value should be a slice of 19 | // key/value pairs, each represented by a mr.KeyValue. 20 | func Map(document string, value string) (res []mr.KeyValue) { 21 | m := make(map[string]bool) 22 | words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) }) 23 | for _, w := range words { 24 | m[w] = true 25 | } 26 | for w := range m { 27 | kv := mr.KeyValue{w, document} 28 | res = append(res, kv) 29 | } 30 | return 31 | } 32 | 33 | // The reduce function is called once for each key generated by Map, with a 34 | // list of that key's string value (merged across all inputs). The return value 35 | // should be a single output value for that key. 36 | func Reduce(key string, values []string) string { 37 | sort.Strings(values) 38 | return fmt.Sprintf("%d %s", len(values), strings.Join(values, ",")) 39 | } 40 | -------------------------------------------------------------------------------- /src/mrapps/mtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute map tasks in parallel. 6 | // 7 | // go build -buildmode=plugin mtiming.go 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "strings" 12 | import "fmt" 13 | import "os" 14 | import "syscall" 15 | import "time" 16 | import "sort" 17 | import "io/ioutil" 18 | 19 | func nparallel(phase string) int { 20 | // create a file so that other workers will see that 21 | // we're running at the same time as them. 22 | pid := os.Getpid() 23 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 24 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | // are any other workers running? 30 | // find their PIDs by scanning directory for mr-worker-XXX files. 31 | dd, err := os.Open(".") 32 | if err != nil { 33 | panic(err) 34 | } 35 | names, err := dd.Readdirnames(1000000) 36 | if err != nil { 37 | panic(err) 38 | } 39 | ret := 0 40 | for _, name := range names { 41 | var xpid int 42 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 43 | n, err := fmt.Sscanf(name, pat, &xpid) 44 | if n == 1 && err == nil { 45 | err := syscall.Kill(xpid, 0) 46 | if err == nil { 47 | // if err == nil, xpid is alive. 48 | ret += 1 49 | } 50 | } 51 | } 52 | dd.Close() 53 | 54 | time.Sleep(1 * time.Second) 55 | 56 | err = os.Remove(myfilename) 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | return ret 62 | } 63 | 64 | func Map(filename string, contents string) []mr.KeyValue { 65 | t0 := time.Now() 66 | ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0) 67 | pid := os.Getpid() 68 | 69 | n := nparallel("map") 70 | 71 | kva := []mr.KeyValue{} 72 | kva = append(kva, mr.KeyValue{ 73 | fmt.Sprintf("times-%v", pid), 74 | fmt.Sprintf("%.1f", ts)}) 75 | kva = append(kva, mr.KeyValue{ 76 | fmt.Sprintf("parallel-%v", pid), 77 | fmt.Sprintf("%d", n)}) 78 | return kva 79 | } 80 | 81 | func Reduce(key string, values []string) string { 82 | //n := nparallel("reduce") 83 | 84 | // sort values to ensure deterministic output. 
85 | vv := make([]string, len(values)) 86 | copy(vv, values) 87 | sort.Strings(vv) 88 | 89 | val := strings.Join(vv, " ") 90 | return val 91 | } 92 | -------------------------------------------------------------------------------- /src/mrapps/nocrash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // same as crash.go but doesn't actually crash. 5 | // 6 | // go build -buildmode=plugin nocrash.go 7 | // 8 | 9 | import "Mit6824/src/mr" 10 | import crand "crypto/rand" 11 | import "math/big" 12 | import "strings" 13 | import "os" 14 | import "sort" 15 | import "strconv" 16 | 17 | func maybeCrash() { 18 | max := big.NewInt(1000) 19 | rr, _ := crand.Int(crand.Reader, max) 20 | if false && rr.Int64() < 500 { 21 | // crash! 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Map(filename string, contents string) []mr.KeyValue { 27 | maybeCrash() 28 | 29 | kva := []mr.KeyValue{} 30 | kva = append(kva, mr.KeyValue{"a", filename}) 31 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 32 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 33 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 34 | return kva 35 | } 36 | 37 | func Reduce(key string, values []string) string { 38 | maybeCrash() 39 | 40 | // sort values to ensure deterministic output. 41 | vv := make([]string, len(values)) 42 | copy(vv, values) 43 | sort.Strings(vv) 44 | 45 | val := strings.Join(vv, " ") 46 | return val 47 | } 48 | -------------------------------------------------------------------------------- /src/mrapps/rtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute reduce tasks in parallel. 6 | // 7 | // go build -buildmode=plugin rtiming.go 8 | // 9 | 10 | import "Mit6824/src/mr" 11 | import "fmt" 12 | import "os" 13 | import "syscall" 14 | import "time" 15 | import "io/ioutil" 16 | 17 | func nparallel(phase string) int { 18 | // create a file so that other workers will see that 19 | // we're running at the same time as them. 20 | pid := os.Getpid() 21 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 22 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | // are any other workers running? 28 | // find their PIDs by scanning directory for mr-worker-XXX files. 29 | dd, err := os.Open(".") 30 | if err != nil { 31 | panic(err) 32 | } 33 | names, err := dd.Readdirnames(1000000) 34 | if err != nil { 35 | panic(err) 36 | } 37 | ret := 0 38 | for _, name := range names { 39 | var xpid int 40 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 41 | n, err := fmt.Sscanf(name, pat, &xpid) 42 | if n == 1 && err == nil { 43 | err := syscall.Kill(xpid, 0) 44 | if err == nil { 45 | // if err == nil, xpid is alive. 
46 | ret += 1 47 | } 48 | } 49 | } 50 | dd.Close() 51 | 52 | time.Sleep(1 * time.Second) 53 | 54 | err = os.Remove(myfilename) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | return ret 60 | } 61 | 62 | func Map(filename string, contents string) []mr.KeyValue { 63 | 64 | kva := []mr.KeyValue{} 65 | kva = append(kva, mr.KeyValue{"a", "1"}) 66 | kva = append(kva, mr.KeyValue{"b", "1"}) 67 | kva = append(kva, mr.KeyValue{"c", "1"}) 68 | kva = append(kva, mr.KeyValue{"d", "1"}) 69 | kva = append(kva, mr.KeyValue{"e", "1"}) 70 | kva = append(kva, mr.KeyValue{"f", "1"}) 71 | kva = append(kva, mr.KeyValue{"g", "1"}) 72 | kva = append(kva, mr.KeyValue{"h", "1"}) 73 | kva = append(kva, mr.KeyValue{"i", "1"}) 74 | kva = append(kva, mr.KeyValue{"j", "1"}) 75 | return kva 76 | } 77 | 78 | func Reduce(key string, values []string) string { 79 | n := nparallel("reduce") 80 | 81 | val := fmt.Sprintf("%d", n) 82 | 83 | return val 84 | } 85 | -------------------------------------------------------------------------------- /src/mrapps/wc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc.go 7 | // 8 | 9 | import "Mit6824/src/mr" 10 | import "unicode" 11 | import "strings" 12 | import "strconv" 13 | 14 | // 15 | // The map function is called once for each file of input. The first 16 | // argument is the name of the input file, and the second is the 17 | // file's complete contents. You should ignore the input file name, 18 | // and look only at the contents argument. The return value is a slice 19 | // of key/value pairs. 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | // function to detect word separators. 23 | ff := func(r rune) bool { return !unicode.IsLetter(r) } 24 | 25 | // split contents into an array of words. 26 | words := strings.FieldsFunc(contents, ff) 27 | 28 | kva := []mr.KeyValue{} 29 | for _, w := range words { 30 | kv := mr.KeyValue{w, "1"} 31 | kva = append(kva, kv) 32 | } 33 | return kva 34 | } 35 | 36 | // 37 | // The reduce function is called once for each key generated by the 38 | // map tasks, with a list of all the values created for that key by 39 | // any map task. 40 | // 41 | func Reduce(key string, values []string) string { 42 | // return the number of occurrences of this word. 43 | return strconv.Itoa(len(values)) 44 | } 45 | -------------------------------------------------------------------------------- /src/mroriginal/master.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | import ( 4 | "Mit6824/src/mr" 5 | "log" 6 | ) 7 | import "net" 8 | import "os" 9 | import "net/rpc" 10 | import "net/http" 11 | 12 | type Master struct { 13 | // Your definitions here. 14 | 15 | } 16 | 17 | // Your code here -- RPC handlers for the worker to call. 18 | 19 | // 20 | // an example RPC handler. 
21 | // 22 | func (m *Master) Example(args *mr.ExampleArgs, reply *mr.ExampleReply) error { 23 | reply.Y = args.X + 1 24 | return nil 25 | } 26 | 27 | // 28 | // start a thread that listens for RPCs from worker.go 29 | // 30 | func (m *Master) server() { 31 | rpc.Register(m) 32 | rpc.HandleHTTP() 33 | //l, e := net.Listen("tcp", ":1234") 34 | os.Remove("mr-socket") 35 | l, e := net.Listen("unix", "mr-socket") 36 | if e != nil { 37 | log.Fatal("listen error:", e) 38 | } 39 | go http.Serve(l, nil) 40 | } 41 | 42 | // 43 | // main/mrmaster.go calls Done() periodically to find out 44 | // if the entire job has finished. 45 | // 46 | func (m *Master) Done() bool { 47 | ret := false 48 | 49 | // Your code here. 50 | 51 | return ret 52 | } 53 | 54 | // 55 | // create a Master. 56 | // 57 | func MakeMaster(files []string, nReduce int) *Master { 58 | m := Master{} 59 | 60 | // Your code here. 61 | 62 | return &m 63 | } 64 | -------------------------------------------------------------------------------- /src/mroriginal/rpc.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | 7 | // 8 | // example to show how to declare the arguments 9 | // and reply for an RPC. 10 | // 11 | 12 | type ExampleArgs struct { 13 | X int 14 | } 15 | 16 | type ExampleReply struct { 17 | Y int 18 | } 19 | 20 | // Add your RPC definitions here. 21 | -------------------------------------------------------------------------------- /src/mroriginal/worker.go: -------------------------------------------------------------------------------- 1 | package mroriginal 2 | 3 | import ( 4 | "Mit6824/src/mr" 5 | "fmt" 6 | ) 7 | import "log" 8 | import "net/rpc" 9 | import "hash/fnv" 10 | 11 | // 12 | // Map functions return a slice of KeyValue. 13 | // 14 | type KeyValue struct { 15 | Key string 16 | Value string 17 | } 18 | 19 | // 20 | // use ihash(key) % NReduce to choose the reduce 21 | // task number for each KeyValue emitted by Map. 22 | // 23 | func ihash(key string) int { 24 | h := fnv.New32a() 25 | h.Write([]byte(key)) 26 | return int(h.Sum32() & 0x7fffffff) 27 | } 28 | 29 | func Worker(mapf func(string, string) []KeyValue, 30 | reducef func(string, []string) string) { 31 | 32 | // Your worker implementation here. 33 | 34 | // uncomment to send the Example RPC to the master. 35 | // CallExample() 36 | } 37 | 38 | // 39 | // example function to show how to make an RPC call to the master. 40 | // 41 | func CallExample() { 42 | 43 | // declare an argument structure. 44 | args := mr.ExampleArgs{} 45 | 46 | // fill in the argument(s). 47 | args.X = 99 48 | 49 | // declare a reply structure. 50 | reply := mr.ExampleReply{} 51 | 52 | // send the RPC request, wait for the reply. 53 | call("Master.Example", &args, &reply) 54 | 55 | // reply.Y should be 100. 56 | fmt.Printf("reply.Y %v\n", reply.Y) 57 | } 58 | 59 | // 60 | // send an RPC request to the master, wait for the response. 61 | // usually returns true. 62 | // returns false if something goes wrong. 
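// a typical pattern in Worker() is to wrap each RPC in call(); for example
// (a sketch only -- "Master.GetTask", TaskArgs and TaskReply are
// illustrative names, not part of this skeleton):
//
// args := TaskArgs{}
// reply := TaskReply{}
// if !call("Master.GetTask", &args, &reply) {
// 	// master unreachable: treat the job as finished and return.
// 	return
// }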
63 | // 64 | func call(rpcname string, args interface{}, reply interface{}) bool { 65 | // c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 66 | c, err := rpc.DialHTTP("unix", "mr-socket") 67 | if err != nil { 68 | log.Fatal("dialing:", err) 69 | } 70 | defer c.Close() 71 | 72 | err = c.Call(rpcname, args, reply) 73 | if err == nil { 74 | return true 75 | } 76 | 77 | fmt.Println(err) 78 | return false 79 | } 80 | -------------------------------------------------------------------------------- /src/raft/config.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft tester. 5 | // 6 | // we will use the original config.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 9 | // 10 | 11 | import "Mit6824/src/labrpc" 12 | import "log" 13 | import "sync" 14 | import "testing" 15 | import "runtime" 16 | import "math/rand" 17 | import crand "crypto/rand" 18 | import "math/big" 19 | import "encoding/base64" 20 | import "time" 21 | import "fmt" 22 | 23 | func randstring(n int) string { 24 | b := make([]byte, 2*n) 25 | crand.Read(b) 26 | s := base64.URLEncoding.EncodeToString(b) 27 | return s[0:n] 28 | } 29 | 30 | func makeSeed() int64 { 31 | max := big.NewInt(int64(1) << 62) 32 | bigx, _ := crand.Int(crand.Reader, max) 33 | x := bigx.Int64() 34 | return x 35 | } 36 | 37 | type config struct { 38 | mu sync.Mutex 39 | t *testing.T 40 | net *labrpc.Network 41 | n int 42 | rafts []*Raft 43 | applyErr []string // from apply channel readers 44 | connected []bool // whether each server is on the net 45 | saved []*Persister 46 | endnames [][]string // the port file names each sends to 47 | logs []map[int]int // copy of each server's committed entries 48 | start time.Time // time at which make_config() was called 49 | // begin()/end() statistics 50 | t0 time.Time // time at which test_test.go called cfg.begin() 51 | rpcs0 int // rpcTotal() at start of test 52 | cmds0 int // number of agreements 53 | maxIndex int 54 | maxIndex0 int 55 | } 56 | 57 | var ncpu_once sync.Once 58 | 59 | func make_config(t *testing.T, n int, unreliable bool) *config { 60 | ncpu_once.Do(func() { 61 | if runtime.NumCPU() < 2 { 62 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 63 | } 64 | rand.Seed(makeSeed()) 65 | }) 66 | runtime.GOMAXPROCS(4) 67 | cfg := &config{} 68 | cfg.t = t 69 | cfg.net = labrpc.MakeNetwork() 70 | cfg.n = n 71 | cfg.applyErr = make([]string, cfg.n) 72 | cfg.rafts = make([]*Raft, cfg.n) 73 | cfg.connected = make([]bool, cfg.n) 74 | cfg.saved = make([]*Persister, cfg.n) 75 | cfg.endnames = make([][]string, cfg.n) 76 | cfg.logs = make([]map[int]int, cfg.n) 77 | cfg.start = time.Now() 78 | 79 | cfg.setunreliable(unreliable) 80 | 81 | cfg.net.LongDelays(true) 82 | 83 | // create a full set of Rafts. 84 | for i := 0; i < cfg.n; i++ { 85 | cfg.logs[i] = map[int]int{} 86 | cfg.start1(i) 87 | } 88 | 89 | // connect everyone 90 | for i := 0; i < cfg.n; i++ { 91 | cfg.connect(i) 92 | } 93 | 94 | return cfg 95 | } 96 | 97 | // shut down a Raft server but save its persistent state. 98 | func (cfg *config) crash1(i int) { 99 | cfg.disconnect(i) 100 | cfg.net.DeleteServer(i) // disable client connections to the server. 101 | 102 | cfg.mu.Lock() 103 | defer cfg.mu.Unlock() 104 | 105 | // a fresh persister, in case old instance 106 | // continues to update the Persister. 
107 | // but copy old persister's content so that we always 108 | // pass Make() the last persisted state. 109 | if cfg.saved[i] != nil { 110 | cfg.saved[i] = cfg.saved[i].Copy() 111 | } 112 | 113 | rf := cfg.rafts[i] 114 | if rf != nil { 115 | cfg.mu.Unlock() 116 | rf.Kill() 117 | cfg.mu.Lock() 118 | cfg.rafts[i] = nil 119 | } 120 | 121 | if cfg.saved[i] != nil { 122 | raftlog := cfg.saved[i].ReadRaftState() 123 | cfg.saved[i] = &Persister{} 124 | cfg.saved[i].SaveRaftState(raftlog) 125 | } 126 | } 127 | 128 | // 129 | // start or re-start a Raft. 130 | // if one already exists, "kill" it first. 131 | // allocate new outgoing port file names, and a new 132 | // state persister, to isolate previous instance of 133 | // this server. since we cannot really kill it. 134 | // 135 | func (cfg *config) start1(i int) { 136 | cfg.crash1(i) 137 | 138 | // a fresh set of outgoing ClientEnd names. 139 | // so that old crashed instance's ClientEnds can't send. 140 | cfg.endnames[i] = make([]string, cfg.n) 141 | for j := 0; j < cfg.n; j++ { 142 | cfg.endnames[i][j] = randstring(20) 143 | } 144 | 145 | // a fresh set of ClientEnds. 146 | ends := make([]*labrpc.ClientEnd, cfg.n) 147 | for j := 0; j < cfg.n; j++ { 148 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 149 | cfg.net.Connect(cfg.endnames[i][j], j) 150 | } 151 | 152 | cfg.mu.Lock() 153 | 154 | // a fresh persister, so old instance doesn't overwrite 155 | // new instance's persisted state. 156 | // but copy old persister's content so that we always 157 | // pass Make() the last persisted state. 158 | if cfg.saved[i] != nil { 159 | cfg.saved[i] = cfg.saved[i].Copy() 160 | } else { 161 | cfg.saved[i] = MakePersister() 162 | } 163 | 164 | cfg.mu.Unlock() 165 | 166 | // listen to messages from Raft indicating newly committed messages. 167 | applyCh := make(chan ApplyMsg) 168 | go func() { 169 | for m := range applyCh { 170 | err_msg := "" 171 | if m.CommandValid == false { 172 | // ignore other types of ApplyMsg 173 | } else if v, ok := (m.Command).(int); ok { 174 | cfg.mu.Lock() 175 | for j := 0; j < len(cfg.logs); j++ { 176 | if old, oldok := cfg.logs[j][m.CommandIndex]; oldok && old != v { 177 | // some server has already committed a different value for this entry! 178 | err_msg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v", 179 | m.CommandIndex, i, m.Command, j, old) 180 | } 181 | } 182 | _, prevok := cfg.logs[i][m.CommandIndex-1] 183 | cfg.logs[i][m.CommandIndex] = v 184 | if m.CommandIndex > cfg.maxIndex { 185 | cfg.maxIndex = m.CommandIndex 186 | } 187 | cfg.mu.Unlock() 188 | 189 | if m.CommandIndex > 1 && prevok == false { 190 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 191 | } 192 | } else { 193 | err_msg = fmt.Sprintf("committed command %v is not an int", m.Command) 194 | } 195 | 196 | if err_msg != "" { 197 | log.Fatalf("apply error: %v\n", err_msg) 198 | cfg.applyErr[i] = err_msg 199 | // keep reading after error so that Raft doesn't block 200 | // holding locks... 
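// (taken together, the checks above spell out the applyCh contract the
// labs expect: one ApplyMsg per committed entry with CommandValid set,
// indexes applied in order on each server, and the same command at any
// given index on every server.)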
201 | } 202 | } 203 | }() 204 | 205 | rf := Make(ends, i, cfg.saved[i], applyCh) 206 | 207 | cfg.mu.Lock() 208 | cfg.rafts[i] = rf 209 | cfg.mu.Unlock() 210 | 211 | svc := labrpc.MakeService(rf) 212 | srv := labrpc.MakeServer() 213 | srv.AddService(svc) 214 | cfg.net.AddServer(i, srv) 215 | } 216 | 217 | func (cfg *config) checkTimeout() { 218 | // enforce a two minute real-time limit on each test 219 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 220 | cfg.t.Fatal("test took longer than 120 seconds") 221 | } 222 | } 223 | 224 | func (cfg *config) cleanup() { 225 | for i := 0; i < len(cfg.rafts); i++ { 226 | if cfg.rafts[i] != nil { 227 | cfg.rafts[i].Kill() 228 | } 229 | } 230 | cfg.net.Cleanup() 231 | cfg.checkTimeout() 232 | } 233 | 234 | // attach server i to the net. 235 | func (cfg *config) connect(i int) { 236 | // fmt.Printf("connect(%d)\n", i) 237 | 238 | cfg.connected[i] = true 239 | 240 | // outgoing ClientEnds 241 | for j := 0; j < cfg.n; j++ { 242 | if cfg.connected[j] { 243 | endname := cfg.endnames[i][j] 244 | cfg.net.Enable(endname, true) 245 | } 246 | } 247 | 248 | // incoming ClientEnds 249 | for j := 0; j < cfg.n; j++ { 250 | if cfg.connected[j] { 251 | endname := cfg.endnames[j][i] 252 | cfg.net.Enable(endname, true) 253 | } 254 | } 255 | } 256 | 257 | // detach server i from the net. 258 | func (cfg *config) disconnect(i int) { 259 | // fmt.Printf("disconnect(%d)\n", i) 260 | 261 | cfg.connected[i] = false 262 | 263 | // outgoing ClientEnds 264 | for j := 0; j < cfg.n; j++ { 265 | if cfg.endnames[i] != nil { 266 | endname := cfg.endnames[i][j] 267 | cfg.net.Enable(endname, false) 268 | } 269 | } 270 | 271 | // incoming ClientEnds 272 | for j := 0; j < cfg.n; j++ { 273 | if cfg.endnames[j] != nil { 274 | endname := cfg.endnames[j][i] 275 | cfg.net.Enable(endname, false) 276 | } 277 | } 278 | } 279 | 280 | func (cfg *config) rpcCount(server int) int { 281 | return cfg.net.GetCount(server) 282 | } 283 | 284 | func (cfg *config) rpcTotal() int { 285 | return cfg.net.GetTotalCount() 286 | } 287 | 288 | func (cfg *config) setunreliable(unrel bool) { 289 | cfg.net.Reliable(!unrel) 290 | } 291 | 292 | func (cfg *config) setlongreordering(longrel bool) { 293 | cfg.net.LongReordering(longrel) 294 | } 295 | 296 | // check that there's exactly one leader. 297 | // try a few times in case re-elections are needed. 298 | func (cfg *config) checkOneLeader() int { 299 | for iters := 0; iters < 10; iters++ { 300 | ms := 450 + (rand.Int63() % 100) 301 | time.Sleep(time.Duration(ms) * time.Millisecond) 302 | 303 | leaders := make(map[int][]int) 304 | for i := 0; i < cfg.n; i++ { 305 | if cfg.connected[i] { 306 | if term, leader := cfg.rafts[i].GetState(); leader { 307 | leaders[term] = append(leaders[term], i) 308 | } 309 | } 310 | } 311 | 312 | lastTermWithLeader := -1 313 | for term, leaders := range leaders { 314 | if len(leaders) > 1 { 315 | cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders)) 316 | } 317 | if term > lastTermWithLeader { 318 | lastTermWithLeader = term 319 | } 320 | } 321 | 322 | if len(leaders) != 0 { 323 | return leaders[lastTermWithLeader][0] 324 | } 325 | } 326 | cfg.t.Fatalf("expected one leader, got none") 327 | return -1 328 | } 329 | 330 | // check that everyone agrees on the term. 
331 | func (cfg *config) checkTerms() int { 332 | term := -1 333 | for i := 0; i < cfg.n; i++ { 334 | if cfg.connected[i] { 335 | xterm, _ := cfg.rafts[i].GetState() 336 | if term == -1 { 337 | term = xterm 338 | } else if term != xterm { 339 | cfg.t.Fatalf("servers disagree on term") 340 | } 341 | } 342 | } 343 | return term 344 | } 345 | 346 | // check that there's no leader 347 | func (cfg *config) checkNoLeader() { 348 | for i := 0; i < cfg.n; i++ { 349 | if cfg.connected[i] { 350 | _, is_leader := cfg.rafts[i].GetState() 351 | if is_leader { 352 | cfg.t.Fatalf("expected no leader, but %v claims to be leader", i) 353 | } 354 | } 355 | } 356 | } 357 | 358 | // how many servers think a log entry is committed? 359 | func (cfg *config) nCommitted(index int) (int, interface{}) { 360 | count := 0 361 | cmd := -1 362 | for i := 0; i < len(cfg.rafts); i++ { 363 | if cfg.applyErr[i] != "" { 364 | cfg.t.Fatal(cfg.applyErr[i]) 365 | } 366 | 367 | cfg.mu.Lock() 368 | cmd1, ok := cfg.logs[i][index] 369 | cfg.mu.Unlock() 370 | 371 | if ok { 372 | if count > 0 && cmd != cmd1 { 373 | cfg.t.Fatalf("committed values do not match: index %v, %v, %v\n", 374 | index, cmd, cmd1) 375 | } 376 | count += 1 377 | cmd = cmd1 378 | } 379 | } 380 | return count, cmd 381 | } 382 | 383 | // wait for at least n servers to commit. 384 | // but don't wait forever. 385 | func (cfg *config) wait(index int, n int, startTerm int) interface{} { 386 | to := 10 * time.Millisecond 387 | for iters := 0; iters < 30; iters++ { 388 | nd, _ := cfg.nCommitted(index) 389 | if nd >= n { 390 | break 391 | } 392 | time.Sleep(to) 393 | if to < time.Second { 394 | to *= 2 395 | } 396 | if startTerm > -1 { 397 | for _, r := range cfg.rafts { 398 | if t, _ := r.GetState(); t > startTerm { 399 | // someone has moved on 400 | // can no longer guarantee that we'll "win" 401 | return -1 402 | } 403 | } 404 | } 405 | } 406 | nd, cmd := cfg.nCommitted(index) 407 | if nd < n { 408 | cfg.t.Fatalf("only %d decided for index %d; wanted %d\n", 409 | nd, index, n) 410 | } 411 | return cmd 412 | } 413 | 414 | // do a complete agreement. 415 | // it might choose the wrong leader initially, 416 | // and have to re-submit after giving up. 417 | // entirely gives up after about 10 seconds. 418 | // indirectly checks that the servers agree on the 419 | // same value, since nCommitted() checks this, 420 | // as do the threads that read from applyCh. 421 | // returns index. 422 | // if retry==true, may submit the command multiple 423 | // times, in case a leader fails just after Start(). 424 | // if retry==false, calls Start() only once, in order 425 | // to simplify the early Lab 2B tests. 426 | func (cfg *config) one(cmd int, expectedServers int, retry bool) int { 427 | t0 := time.Now() 428 | starts := 0 429 | for time.Since(t0).Seconds() < 10 { 430 | // try all the servers, maybe one is the leader. 431 | index := -1 432 | for si := 0; si < cfg.n; si++ { 433 | starts = (starts + 1) % cfg.n 434 | var rf *Raft 435 | cfg.mu.Lock() 436 | if cfg.connected[starts] { 437 | rf = cfg.rafts[starts] 438 | } 439 | cfg.mu.Unlock() 440 | if rf != nil { 441 | index1, _, ok := rf.Start(cmd) 442 | if ok { 443 | index = index1 444 | break 445 | } 446 | } 447 | } 448 | 449 | if index != -1 { 450 | // somebody claimed to be the leader and to have 451 | // submitted our command; wait a while for agreement. 
452 | t1 := time.Now() 453 | for time.Since(t1).Seconds() < 2 { 454 | nd, cmd1 := cfg.nCommitted(index) 455 | if nd > 0 && nd >= expectedServers { 456 | // committed 457 | if cmd2, ok := cmd1.(int); ok && cmd2 == cmd { 458 | // and it was the command we submitted. 459 | return index 460 | } 461 | } 462 | time.Sleep(20 * time.Millisecond) 463 | } 464 | if retry == false { 465 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 466 | } 467 | } else { 468 | time.Sleep(50 * time.Millisecond) 469 | } 470 | } 471 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 472 | return -1 473 | } 474 | 475 | // start a Test. 476 | // print the Test message. 477 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 478 | func (cfg *config) begin(description string) { 479 | fmt.Printf("%s ...\n", description) 480 | cfg.t0 = time.Now() 481 | cfg.rpcs0 = cfg.rpcTotal() 482 | cfg.cmds0 = 0 483 | cfg.maxIndex0 = cfg.maxIndex 484 | } 485 | 486 | // end a Test -- the fact that we got here means there 487 | // was no failure. 488 | // print the Passed message, 489 | // and some performance numbers. 490 | func (cfg *config) end() { 491 | cfg.checkTimeout() 492 | if cfg.t.Failed() == false { 493 | cfg.mu.Lock() 494 | t := time.Since(cfg.t0).Seconds() // real time 495 | npeers := cfg.n // number of Raft peers 496 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 497 | ncmds := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported 498 | cfg.mu.Unlock() 499 | 500 | fmt.Printf(" ... Passed --") 501 | fmt.Printf(" %4.1f %d %4d %4d\n", t, npeers, nrpc, ncmds) 502 | } 503 | } 504 | -------------------------------------------------------------------------------- /src/raft/persister.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft and kvraft to save persistent 5 | // Raft state (log &c) and k/v server snapshots. 6 | // 7 | // we will use the original persister.go to test your code for grading. 8 | // so, while you can modify this code to help you debug, please 9 | // test with the original before submitting. 10 | // 11 | 12 | import "sync" 13 | 14 | type Persister struct { 15 | mu sync.Mutex 16 | raftstate []byte 17 | snapshot []byte 18 | } 19 | 20 | func MakePersister() *Persister { 21 | return &Persister{} 22 | } 23 | 24 | func (ps *Persister) Copy() *Persister { 25 | ps.mu.Lock() 26 | defer ps.mu.Unlock() 27 | np := MakePersister() 28 | np.raftstate = ps.raftstate 29 | np.snapshot = ps.snapshot 30 | return np 31 | } 32 | 33 | func (ps *Persister) SaveRaftState(state []byte) { 34 | ps.mu.Lock() 35 | defer ps.mu.Unlock() 36 | ps.raftstate = state 37 | } 38 | 39 | func (ps *Persister) ReadRaftState() []byte { 40 | ps.mu.Lock() 41 | defer ps.mu.Unlock() 42 | return ps.raftstate 43 | } 44 | 45 | func (ps *Persister) RaftStateSize() int { 46 | ps.mu.Lock() 47 | defer ps.mu.Unlock() 48 | return len(ps.raftstate) 49 | } 50 | 51 | // Save both Raft state and K/V snapshot as a single atomic action, 52 | // to help avoid them getting out of sync. 
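// callers typically serialize the snapshot with labgob before handing it
// over; for example (a sketch only -- kvData and raftState are placeholder
// names, not part of this file):
//
// w := new(bytes.Buffer)
// e := labgob.NewEncoder(w)
// e.Encode(kvData)
// persister.SaveStateAndSnapshot(raftState, w.Bytes())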
53 | func (ps *Persister) SaveStateAndSnapshot(state []byte, snapshot []byte) { 54 | ps.mu.Lock() 55 | defer ps.mu.Unlock() 56 | ps.raftstate = state 57 | ps.snapshot = snapshot 58 | } 59 | 60 | func (ps *Persister) ReadSnapshot() []byte { 61 | ps.mu.Lock() 62 | defer ps.mu.Unlock() 63 | return ps.snapshot 64 | } 65 | 66 | func (ps *Persister) SnapshotSize() int { 67 | ps.mu.Lock() 68 | defer ps.mu.Unlock() 69 | return len(ps.snapshot) 70 | } 71 | -------------------------------------------------------------------------------- /src/raft/raft.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // this is an outline of the API that raft must expose to 5 | // the service (or tester). see comments below for 6 | // each of these functions for more details. 7 | // 8 | // rf = Make(...) 9 | // create a new Raft server. 10 | // rf.Start(command interface{}) (index, term, isleader) 11 | // start agreement on a new log entry 12 | // rf.GetState() (term, isLeader) 13 | // ask a Raft for its current term, and whether it thinks it is leader 14 | // ApplyMsg 15 | // each time a new entry is committed to the log, each Raft peer 16 | // should send an ApplyMsg to the service (or tester) 17 | // in the same server. 18 | // 19 | 20 | import ( 21 | "fmt" 22 | "math/rand" 23 | "strconv" 24 | "sync" 25 | "time" 26 | ) 27 | import "sync/atomic" 28 | import "Mit6824/src/labrpc" 29 | 30 | // import "bytes" 31 | // import "labgob" 32 | 33 | // 34 | // as each Raft peer becomes aware that successive log entries are 35 | // committed, the peer should send an ApplyMsg to the service (or 36 | // tester) on the same server, via the applyCh passed to Make(). set 37 | // CommandValid to true to indicate that the ApplyMsg contains a newly 38 | // committed log entry. 39 | // 40 | // in Lab 3 you'll want to send other kinds of messages (e.g., 41 | // snapshots) on the applyCh; at that point you can add fields to 42 | // ApplyMsg, but set CommandValid to false for these other uses. 43 | // 44 | type ApplyMsg struct { 45 | CommandValid bool 46 | Command interface{} 47 | CommandIndex int 48 | } 49 | 50 | // 51 | // A Go object implementing a single Raft peer. 52 | // 53 | // TODO: 对raft 各种关键行为的操作 没有保证原子性 54 | type Raft struct { 55 | // TODO: 这里可以该成读写锁 进一步优化 56 | mu sync.Mutex // Lock to protect shared access to this peer's state 57 | peers []*labrpc.ClientEnd // RPC end points of all peers 58 | persister *Persister // Object to hold this peer's persisted state 59 | me int // this peer's index into peers[] 60 | dead int32 // set by Kill() 61 | closeChan chan struct{} // 关闭信号 62 | // Your data here (2A, 2B, 2C). 63 | // Look at the paper's Figure 2 for a description of what 64 | // state a Raft server must maintain. 65 | // 唯一 持久化的配置 66 | roles int32 // 标示当前对等点当前的角色 1 跟随者 2 候选人 3 领导者 67 | id atomic.Value // 对等点唯一标示 68 | curVoteTarget atomic.Value // 当一次投票给node的ID 69 | myTerm int64 // 最后的已知的任期 70 | // TODO: 是否存在 一个并发安全的 list? 
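// (the standard library has no concurrency-safe slice; the usual answer is
// the mu field above -- guard every read and write of the log and index
// fields below with rf.mu rather than hunting for a lock-free container.)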
71 | logs []Wal // 日志条目 72 | lastApplyIdx int // 最后应用于状态机的日志索引 73 | lastCommitIdx int // 最后的提交日志索引 74 | electionTimer *time.Timer // 用于选举的定时器 75 | } 76 | 77 | func (rf *Raft) initLeader() { 78 | // TODO: 初始化 leader 相关数据状态 79 | rf.setCurVoteTarget(rf.getId()) 80 | } 81 | func (rf *Raft) isLeader() bool { 82 | return atomic.CompareAndSwapInt32(&rf.roles, 3, 3) 83 | } 84 | 85 | // 只有从一个候选人才能变更为领导者 86 | func (rf *Raft) coronation() bool { 87 | return atomic.CompareAndSwapInt32(&rf.roles, 2, 3) 88 | } 89 | 90 | func (rf *Raft) isFollower() bool { 91 | return atomic.LoadInt32(&rf.roles) == 1 92 | } 93 | 94 | func (rf *Raft) following() { 95 | atomic.StoreInt32(&rf.roles, 1) 96 | } 97 | 98 | func (rf *Raft) isCandidate() bool { 99 | return atomic.LoadInt32(&rf.roles) == 2 100 | } 101 | 102 | // 只能从 跟随者 变为候选人 103 | func (rf *Raft) setCandidate() bool { 104 | return atomic.CompareAndSwapInt32(&rf.roles, 1, 2) 105 | } 106 | 107 | func (rf *Raft) getLastCommitIdx() int { 108 | //rf.mu.Lock() 109 | //defer rf.mu.Unlock() 110 | return rf.lastCommitIdx 111 | } 112 | 113 | func (rf *Raft) setLogs(los []Wal) { 114 | //rf.mu.Lock() 115 | //defer rf.mu.Unlock() 116 | rf.logs = los 117 | } 118 | func (rf *Raft) setMyTerm(term int64) { 119 | atomic.StoreInt64(&rf.myTerm, term) 120 | } 121 | 122 | func (rf *Raft) getCurVoteTarget() string { 123 | return rf.curVoteTarget.Load().(string) 124 | } 125 | func (rf *Raft) setCurVoteTarget(vote string) { 126 | rf.curVoteTarget.Store(vote) 127 | } 128 | 129 | func (rf *Raft) setLastCommitIdx(lastCommitIdx int) { 130 | //rf.mu.Lock() 131 | //defer rf.mu.Unlock() 132 | rf.lastCommitIdx = lastCommitIdx 133 | } 134 | 135 | func (rf *Raft) getMyTerm() int64 { 136 | return atomic.LoadInt64(&rf.myTerm) 137 | } 138 | func (rf *Raft) incrMyTerm() int64 { 139 | return atomic.AddInt64(&rf.myTerm, 1) 140 | } 141 | func (rf *Raft) getMe() int { 142 | //rf.mu.Lock() 143 | //defer rf.mu.Unlock() 144 | return rf.me 145 | } 146 | 147 | func (rf *Raft) getId() string { 148 | return rf.id.Load().(string) 149 | } 150 | 151 | func (rf *Raft) setId(id string) { 152 | rf.id.Store(id) 153 | } 154 | 155 | type Wal struct { 156 | term int64 157 | cmd string 158 | } 159 | 160 | // return currentTerm and whether this server 161 | // believes it is the leader. 162 | func (rf *Raft) GetState() (int, bool) { 163 | // Your code here (2A). 164 | return int(rf.getMyTerm()), rf.isLeader() 165 | } 166 | 167 | // 168 | // save Raft's persistent state to stable storage, 169 | // where it can later be retrieved after a crash and restart. 170 | // see paper's Figure 2 for a description of what should be persistent. 171 | // 172 | func (rf *Raft) persist() { 173 | // Your code here (2C). 174 | // Example: 175 | // w := new(bytes.Buffer) 176 | // e := labgob.NewEncoder(w) 177 | // e.Encode(rf.xxx) 178 | // e.Encode(rf.yyy) 179 | // data := w.Bytes() 180 | // rf.persister.SaveRaftState(data) 181 | } 182 | 183 | // 184 | // restore previously persisted state. 185 | // 186 | func (rf *Raft) readPersist(data []byte) { 187 | if data == nil || len(data) < 1 { // bootstrap without any state? 188 | return 189 | } 190 | // Your code here (2C). 191 | // Example: 192 | // r := bytes.NewBuffer(data) 193 | // d := labgob.NewDecoder(r) 194 | // var xxx 195 | // var yyy 196 | // if d.Decode(&xxx) != nil || 197 | // d.Decode(&yyy) != nil { 198 | // error... 199 | // } else { 200 | // rf.xxx = xxx 201 | // rf.yyy = yyy 202 | // } 203 | } 204 | 205 | // 206 | // example RequestVote RPC arguments structure. 
207 | // field names must start with capital letters! 208 | // 209 | type RequestVoteArgs struct { 210 | // Your data here (2A, 2B). 211 | CandidateID string 212 | CandidateTerm int64 213 | LastLogIdx int 214 | LastLogTerm int64 215 | } 216 | 217 | // 218 | // example RequestVote RPC reply structure. 219 | // field names must start with capital letters! 220 | // 221 | type RequestVoteReply struct { 222 | // Your data here (2A). 223 | CurTerm int64 224 | IsVote bool 225 | } 226 | 227 | // 228 | // example RequestVote RPC handler. 229 | // 230 | // TODO: 这里可能存在 选票瓜分时 同时竞选成为领导的问题 草你妈的 这一周也没搞定 气死哎呀 231 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 232 | // Your code here (2A, 2B). 233 | //curVoteTarget := rf.getCurVoteTarget() 234 | rf.mu.Lock() 235 | defer rf.mu.Unlock() 236 | 237 | reply.CurTerm = rf.getMyTerm() 238 | // 如果相同 那么说明同时超时 彼此肯定不投票 选票被瓜分 239 | if args.CandidateTerm <= reply.CurTerm { 240 | reply.IsVote = false 241 | return 242 | } 243 | var lastLog Wal 244 | var lastLogIdx int 245 | // TODO: 对获取最后的日志这里应该进行抽象 246 | if len(rf.logs) > 0 { 247 | lastLog = rf.logs[len(rf.logs)-1] 248 | lastLogIdx = len(rf.logs) - 1 249 | } 250 | //rf.mu.Unlock() 251 | if lastLog.term > args.CandidateTerm /*|| len(curVoteTarget) != 0*/ || lastLogIdx > args.LastLogIdx { 252 | reply.IsVote = false 253 | return 254 | } 255 | rf.setMyTerm(args.CandidateTerm) 256 | rf.setCurVoteTarget(args.CandidateID) 257 | rf.following() 258 | // 在投票后重新等待一个选举超时时间,也就是说 选票会抑制跟随者成为候选者,如果节点投票相当于放弃了最近一次的竞选 259 | fmt.Println(rf.getId(), "在任期", reply.CurTerm, "投票给", args.CandidateID, "后任期变为", rf.getMyTerm()) 260 | rf.electionTimer.Reset(getElectionTimeOut()) 261 | reply.IsVote = true 262 | return 263 | } 264 | 265 | // 266 | // example code to send a RequestVote RPC to a server. 267 | // server is the index of the target server in rf.peers[]. 268 | // expects RPC arguments in args. 269 | // fills in *reply with RPC reply, so caller should 270 | // pass &reply. 271 | // the types of the args and reply passed to Call() must be 272 | // the same as the types of the arguments declared in the 273 | // handler function (including whether they are pointers). 274 | // 275 | // The labrpc package simulates a lossy network, in which servers 276 | // may be unreachable, and in which requests and replies may be lost. 277 | // Call() sends a request and waits for a reply. If a reply arrives 278 | // within a timeout interval, Call() returns true; otherwise 279 | // Call() returns false. Thus Call() may not return for a while. 280 | // A false return can be caused by a dead server, a live server that 281 | // can't be reached, a lost request, or a lost reply. 282 | // 283 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 284 | // handler function on the server side does not return. Thus there 285 | // is no need to implement your own timeouts around Call(). 286 | // 287 | // look at the comments in ../labrpc/labrpc.go for more details. 288 | // 289 | // if you're having trouble getting RPC to work, check that you've 290 | // capitalized all field names in structs passed over RPC, and 291 | // that the caller passes the address of the reply struct with &, not 292 | // the struct itself. 
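// because Call() may block, candidates usually fire the RequestVote RPCs
// from separate goroutines and tally replies as they arrive rather than
// looping over peers serially; a sketch (ch and args are illustrative
// names, not part of this file):
//
// ch := make(chan bool, len(rf.peers))
// for i := range rf.peers {
// 	if i == rf.me {
// 		continue
// 	}
// 	go func(server int) {
// 		reply := RequestVoteReply{}
// 		ok := rf.sendRequestVote(server, &args, &reply)
// 		ch <- ok && reply.IsVote
// 	}(i)
// }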
293 | // 294 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) bool { 295 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 296 | return ok 297 | } 298 | 299 | // 300 | // 附加日志/心跳rpc 请求 301 | // 302 | type AppendEntriesArgs struct { 303 | // Your data here (2A, 2B). 304 | LeaderID string 305 | LeaderTerm int64 306 | PreLogIdx int 307 | PreLogTerm int64 308 | LastCommit int 309 | Logs []interface{} 310 | } 311 | 312 | // 313 | // 附加日志/心跳 rpc 的回复 314 | // 315 | type AppendEntriesReply struct { 316 | // Your data here (2A). 317 | CurrTerm int64 318 | IsOK bool 319 | } 320 | 321 | // 322 | // 附加日志/心跳 rpc 的执行体 323 | // 324 | func (rf *Raft) RequestAppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) { 325 | // Your code here (2A, 2B). 326 | rf.mu.Lock() 327 | defer rf.mu.Unlock() 328 | myTerm := rf.getMyTerm() 329 | reply.CurrTerm = myTerm 330 | if args.LeaderTerm < myTerm { 331 | reply.IsOK = false 332 | return 333 | } 334 | if len(args.Logs) == 0 && args.LeaderTerm >= myTerm { 335 | // 如果 请求的任期更高 那么就更新自己认为的leader节点 336 | if args.LeaderTerm > myTerm { 337 | rf.following() 338 | rf.setCurVoteTarget(args.LeaderID) 339 | rf.setMyTerm(args.LeaderTerm) 340 | } 341 | reply.IsOK = true 342 | rf.electionTimer.Reset(getElectionTimeOut()) 343 | return 344 | } 345 | } 346 | 347 | // 附加日志/心跳 rpc 的发送函数 348 | func (rf *Raft) sendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) bool { 349 | rf.mu.Lock() 350 | defer rf.mu.Unlock() 351 | ok := rf.peers[server].Call("Raft.RequestAppendEntries", args, reply) 352 | return ok 353 | } 354 | 355 | // 356 | // the service using Raft (e.g. a k/v server) wants to start 357 | // agreement on the next command to be appended to Raft's log. if this 358 | // server isn't the leader, returns false. otherwise start the 359 | // agreement and return immediately. there is no guarantee that this 360 | // command will ever be committed to the Raft log, since the leader 361 | // may fail or lose an election. even if the Raft instance has been killed, 362 | // this function should return gracefully. 363 | // 364 | // the first return value is the index that the command will appear at 365 | // if it's ever committed. the second return value is the current 366 | // term. the third return value is true if this server believes it is 367 | // the leader. 368 | // 369 | func (rf *Raft) Start(command interface{}) (int, int, bool) { 370 | index := -1 371 | term := -1 372 | isLeader := true 373 | 374 | // Your code here (2B). 375 | 376 | return index, term, isLeader 377 | } 378 | 379 | // 380 | // the tester calls Kill() when a Raft instance won't 381 | // be needed again. for your convenience, we supply 382 | // code to set rf.dead (without needing a lock), 383 | // and a killed() method to test rf.dead in 384 | // long-running loops. you can also add your own 385 | // code to Kill(). you're not required to do anything 386 | // about this, but it may be convenient (for example) 387 | // to suppress debug output from a Kill()ed instance. 388 | // 389 | func (rf *Raft) Kill() { 390 | atomic.StoreInt32(&rf.dead, 1) 391 | // Your code here, if desired. 392 | } 393 | 394 | func (rf *Raft) killed() bool { 395 | z := atomic.LoadInt32(&rf.dead) 396 | return z == 1 397 | } 398 | 399 | // 400 | // the service or tester wants to create a Raft server. the ports 401 | // of all the Raft servers (including this one) are in peers[]. this 402 | // server's port is peers[me]. 
all the servers' peers[] arrays 403 | // have the same order. persister is a place for this server to 404 | // save its persistent state, and also initially holds the most 405 | // recent saved state, if any. applyCh is a channel on which the 406 | // tester or service expects Raft to send ApplyMsg messages. 407 | // Make() must return quickly, so it should start goroutines 408 | // for any long-running work. 409 | // 410 | func Make(peers []*labrpc.ClientEnd, me int, 411 | persister *Persister, applyCh chan ApplyMsg) *Raft { 412 | rf := &Raft{} 413 | rf.peers = peers 414 | rf.persister = persister 415 | rf.me = me 416 | rf.closeChan = make(chan struct{}) 417 | // Your initialization code here (2A, 2B, 2C). 418 | rf.setId(strconv.Itoa(me)) // TODO: 应该去除特殊字符 419 | rf.setCurVoteTarget(strconv.Itoa(me)) // TODO: 应先从持久化数据中恢复 420 | rf.logs = make([]Wal, 0) 421 | rf.myTerm = 1 // 初始化的时候 大家都认为自己是1,除非 被快照覆盖 422 | rf.following() 423 | go rf.election() 424 | go rf.heartbeat() 425 | // initialize from state persisted before a crash 426 | rf.readPersist(persister.ReadRaftState()) 427 | 428 | return rf 429 | } 430 | func getElectionTimeOut() time.Duration { 431 | rand.Seed(time.Now().UnixNano()) 432 | ts := time.Duration(300+(rand.Int63()%200)) * time.Millisecond 433 | return ts 434 | } 435 | 436 | func (rf *Raft) election() { 437 | rf.electionTimer = time.NewTimer(getElectionTimeOut()) 438 | defer rf.electionTimer.Stop() 439 | for { 440 | select { 441 | case <-rf.electionTimer.C: 442 | if !rf.setCandidate() { //成为候选人 443 | continue 444 | } 445 | rf.incrMyTerm() 446 | rf.setCurVoteTarget("") 447 | myTerm := rf.getMyTerm() 448 | me := rf.me 449 | var lastLogIdx int 450 | var lastLogTerm int64 451 | peersLen := len(rf.peers) 452 | if len(rf.logs) > 0 { 453 | lastLogIdx, lastLogTerm = len(rf.logs)-1, rf.logs[len(rf.logs)-1].term 454 | } 455 | voteArgs, voteRes := RequestVoteArgs{ 456 | CandidateID: rf.getId(), 457 | CandidateTerm: myTerm, 458 | LastLogIdx: lastLogIdx, 459 | LastLogTerm: lastLogTerm, 460 | }, RequestVoteReply{} 461 | res := make([]bool, peersLen) 462 | res[me] = true // 候选人会投自己一票 463 | for i := 0; i < peersLen; i++ { 464 | if i == me { 465 | continue 466 | } 467 | if ok := rf.sendRequestVote(i, &voteArgs, &voteRes); ok { 468 | if voteRes.CurTerm > myTerm { 469 | // 如果 他认为的民众比他任期更高 那么他就回退到跟随者 470 | rf.setMyTerm(voteRes.CurTerm) 471 | rf.following() 472 | goto f 473 | } 474 | res[i] = voteRes.IsVote 475 | } 476 | } 477 | count := 0 478 | for _, v := range res { 479 | if v { 480 | count++ 481 | } 482 | } 483 | if count >= (peersLen)/2+1 { 484 | for !rf.coronation() { 485 | } // CAS 486 | fmt.Println(rf.getId(), "在任期", rf.getMyTerm(), "成为领导者", "获得选票", res, "状态是", rf.roles) 487 | rf.initLeader() 488 | } 489 | case <-rf.closeChan: 490 | return 491 | } 492 | f: 493 | rf.electionTimer.Reset(getElectionTimeOut()) 494 | } 495 | } 496 | 497 | // TODO:expected one leader, got none 无法在 5秒内选举出leader 明天 还是去看看论文吧 这个时候 我怀疑自己的理解有问题了 草 还是不能通过 哭唧唧 498 | func (rf *Raft) heartbeat() { 499 | c := time.NewTicker(100 * time.Millisecond) 500 | for { 501 | select { 502 | case <-c.C: 503 | if rf.isLeader() { 504 | id := rf.getId() 505 | myTerm := rf.getMyTerm() 506 | peersLen := len(rf.peers) 507 | me := rf.me 508 | res := make([]bool, len(rf.peers)) 509 | res[me] = true 510 | for i := 0; i < peersLen; i++ { 511 | if i == me { 512 | continue 513 | } 514 | reply := &AppendEntriesReply{} 515 | ret := rf.sendAppendEntries(i, &AppendEntriesArgs{ 516 | LeaderID: id, 517 | LeaderTerm: myTerm, 518 | Logs: 
make([]interface{}, 0), 519 | }, reply) 520 | res[i] = ret 521 | if ret && reply.CurrTerm > myTerm { 522 | rf.setMyTerm(reply.CurrTerm) 523 | rf.following() 524 | rf.electionTimer.Reset(getElectionTimeOut()) 525 | break 526 | } 527 | } 528 | count := 0 529 | for _, v := range res { 530 | if v { 531 | count++ 532 | } 533 | } 534 | if count < (peersLen)/2+1 { 535 | rf.following() 536 | rf.electionTimer.Reset(getElectionTimeOut()) 537 | } 538 | } 539 | case <-rf.closeChan: 540 | return 541 | } 542 | } 543 | } 544 | -------------------------------------------------------------------------------- /src/raft/util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "log" 4 | 5 | // Debugging 6 | const Debug = 0 7 | 8 | func DPrintf(format string, a ...interface{}) (n int, err error) { 9 | if Debug > 0 { 10 | log.Printf(format, a...) 11 | } 12 | return 13 | } 14 | -------------------------------------------------------------------------------- /src/shardkv/client.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // client code to talk to a sharded key/value service. 5 | // 6 | // the client first talks to the shardmaster to find out 7 | // the assignment of shards (keys) to groups, and then 8 | // talks to the group that holds the key's shard. 9 | // 10 | 11 | import "labrpc" 12 | import "crypto/rand" 13 | import "math/big" 14 | import "shardmaster" 15 | import "time" 16 | 17 | // 18 | // which shard is a key in? 19 | // please use this function, 20 | // and please do not change it. 21 | // 22 | func key2shard(key string) int { 23 | shard := 0 24 | if len(key) > 0 { 25 | shard = int(key[0]) 26 | } 27 | shard %= shardmaster.NShards 28 | return shard 29 | } 30 | 31 | func nrand() int64 { 32 | max := big.NewInt(int64(1) << 62) 33 | bigx, _ := rand.Int(rand.Reader, max) 34 | x := bigx.Int64() 35 | return x 36 | } 37 | 38 | type Clerk struct { 39 | sm *shardmaster.Clerk 40 | config shardmaster.Config 41 | make_end func(string) *labrpc.ClientEnd 42 | // You will have to modify this struct. 43 | } 44 | 45 | // 46 | // the tester calls MakeClerk. 47 | // 48 | // masters[] is needed to call shardmaster.MakeClerk(). 49 | // 50 | // make_end(servername) turns a server name from a 51 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 52 | // send RPCs. 53 | // 54 | func MakeClerk(masters []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *Clerk { 55 | ck := new(Clerk) 56 | ck.sm = shardmaster.MakeClerk(masters) 57 | ck.make_end = make_end 58 | // You'll have to add code here. 59 | return ck 60 | } 61 | 62 | // 63 | // fetch the current value for a key. 64 | // returns "" if the key does not exist. 65 | // keeps trying forever in the face of all other errors. 66 | // You will have to modify this function. 67 | // 68 | func (ck *Clerk) Get(key string) string { 69 | args := GetArgs{} 70 | args.Key = key 71 | 72 | for { 73 | shard := key2shard(key) 74 | gid := ck.config.Shards[shard] 75 | if servers, ok := ck.config.Groups[gid]; ok { 76 | // try each server for the shard. 77 | for si := 0; si < len(servers); si++ { 78 | srv := ck.make_end(servers[si]) 79 | var reply GetReply 80 | ok := srv.Call("ShardKV.Get", &args, &reply) 81 | if ok && (reply.Err == OK || reply.Err == ErrNoKey) { 82 | return reply.Value 83 | } 84 | if ok && (reply.Err == ErrWrongGroup) { 85 | break 86 | } 87 | // ... 
not ok, or ErrWrongLeader 88 | } 89 | } 90 | time.Sleep(100 * time.Millisecond) 91 | // ask master for the latest configuration. 92 | ck.config = ck.sm.Query(-1) 93 | } 94 | 95 | return "" 96 | } 97 | 98 | // 99 | // shared by Put and Append. 100 | // You will have to modify this function. 101 | // 102 | func (ck *Clerk) PutAppend(key string, value string, op string) { 103 | args := PutAppendArgs{} 104 | args.Key = key 105 | args.Value = value 106 | args.Op = op 107 | 108 | for { 109 | shard := key2shard(key) 110 | gid := ck.config.Shards[shard] 111 | if servers, ok := ck.config.Groups[gid]; ok { 112 | for si := 0; si < len(servers); si++ { 113 | srv := ck.make_end(servers[si]) 114 | var reply PutAppendReply 115 | ok := srv.Call("ShardKV.PutAppend", &args, &reply) 116 | if ok && reply.Err == OK { 117 | return 118 | } 119 | if ok && reply.Err == ErrWrongGroup { 120 | break 121 | } 122 | // ... not ok, or ErrWrongLeader 123 | } 124 | } 125 | time.Sleep(100 * time.Millisecond) 126 | // ask master for the latest configuration. 127 | ck.config = ck.sm.Query(-1) 128 | } 129 | } 130 | 131 | func (ck *Clerk) Put(key string, value string) { 132 | ck.PutAppend(key, value, "Put") 133 | } 134 | func (ck *Clerk) Append(key string, value string) { 135 | ck.PutAppend(key, value, "Append") 136 | } 137 | -------------------------------------------------------------------------------- /src/shardkv/common.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // Sharded key/value server. 5 | // Lots of replica groups, each running op-at-a-time paxos. 6 | // Shardmaster decides which group serves each shard. 7 | // Shardmaster may change shard assignment from time to time. 8 | // 9 | // You will have to modify these definitions. 10 | // 11 | 12 | const ( 13 | OK = "OK" 14 | ErrNoKey = "ErrNoKey" 15 | ErrWrongGroup = "ErrWrongGroup" 16 | ErrWrongLeader = "ErrWrongLeader" 17 | ) 18 | 19 | type Err string 20 | 21 | // Put or Append 22 | type PutAppendArgs struct { 23 | // You'll have to add definitions here. 24 | Key string 25 | Value string 26 | Op string // "Put" or "Append" 27 | // You'll have to add definitions here. 28 | // Field names must start with capital letters, 29 | // otherwise RPC will break. 30 | } 31 | 32 | type PutAppendReply struct { 33 | Err Err 34 | } 35 | 36 | type GetArgs struct { 37 | Key string 38 | // You'll have to add definitions here. 
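// (commonly that means enough caller identity for the servers to filter
// duplicate requests, e.g. -- purely illustrative, not required by this
// skeleton -- a ClientId int64 plus a per-client SeqNum int64.)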
39 | } 40 | 41 | type GetReply struct { 42 | Err Err 43 | Value string 44 | } 45 | -------------------------------------------------------------------------------- /src/shardkv/config.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "shardmaster" 4 | import "labrpc" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/big" 11 | import "math/rand" 12 | import "encoding/base64" 13 | import "sync" 14 | import "runtime" 15 | import "raft" 16 | import "strconv" 17 | import "fmt" 18 | import "time" 19 | 20 | func randstring(n int) string { 21 | b := make([]byte, 2*n) 22 | crand.Read(b) 23 | s := base64.URLEncoding.EncodeToString(b) 24 | return s[0:n] 25 | } 26 | 27 | func makeSeed() int64 { 28 | max := big.NewInt(int64(1) << 62) 29 | bigx, _ := crand.Int(crand.Reader, max) 30 | x := bigx.Int64() 31 | return x 32 | } 33 | 34 | // Randomize server handles 35 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 36 | sa := make([]*labrpc.ClientEnd, len(kvh)) 37 | copy(sa, kvh) 38 | for i := range sa { 39 | j := rand.Intn(i + 1) 40 | sa[i], sa[j] = sa[j], sa[i] 41 | } 42 | return sa 43 | } 44 | 45 | type group struct { 46 | gid int 47 | servers []*ShardKV 48 | saved []*raft.Persister 49 | endnames [][]string 50 | mendnames [][]string 51 | } 52 | 53 | type config struct { 54 | mu sync.Mutex 55 | t *testing.T 56 | net *labrpc.Network 57 | start time.Time // time at which make_config() was called 58 | 59 | nmasters int 60 | masterservers []*shardmaster.ShardMaster 61 | mck *shardmaster.Clerk 62 | 63 | ngroups int 64 | n int // servers per k/v group 65 | groups []*group 66 | 67 | clerks map[*Clerk][]string 68 | nextClientId int 69 | maxraftstate int 70 | } 71 | 72 | func (cfg *config) checkTimeout() { 73 | // enforce a two minute real-time limit on each test 74 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 75 | cfg.t.Fatal("test took longer than 120 seconds") 76 | } 77 | } 78 | 79 | func (cfg *config) cleanup() { 80 | for gi := 0; gi < cfg.ngroups; gi++ { 81 | cfg.ShutdownGroup(gi) 82 | } 83 | cfg.net.Cleanup() 84 | cfg.checkTimeout() 85 | } 86 | 87 | // check that no server's log is too big. 88 | func (cfg *config) checklogs() { 89 | for gi := 0; gi < cfg.ngroups; gi++ { 90 | for i := 0; i < cfg.n; i++ { 91 | raft := cfg.groups[gi].saved[i].RaftStateSize() 92 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 93 | if cfg.maxraftstate >= 0 && raft > 2*cfg.maxraftstate { 94 | cfg.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v", 95 | raft, cfg.maxraftstate) 96 | } 97 | if cfg.maxraftstate < 0 && snap > 0 { 98 | cfg.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!") 99 | } 100 | } 101 | } 102 | } 103 | 104 | // master server name for labrpc. 105 | func (cfg *config) mastername(i int) string { 106 | return "master" + strconv.Itoa(i) 107 | } 108 | 109 | // shard server name for labrpc. 110 | // i'th server of group gid. 111 | func (cfg *config) servername(gid int, i int) string { 112 | return "server-" + strconv.Itoa(gid) + "-" + strconv.Itoa(i) 113 | } 114 | 115 | func (cfg *config) makeClient() *Clerk { 116 | cfg.mu.Lock() 117 | defer cfg.mu.Unlock() 118 | 119 | // ClientEnds to talk to master service. 
120 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 121 | endnames := make([]string, cfg.n) 122 | for j := 0; j < cfg.nmasters; j++ { 123 | endnames[j] = randstring(20) 124 | ends[j] = cfg.net.MakeEnd(endnames[j]) 125 | cfg.net.Connect(endnames[j], cfg.mastername(j)) 126 | cfg.net.Enable(endnames[j], true) 127 | } 128 | 129 | ck := MakeClerk(ends, func(servername string) *labrpc.ClientEnd { 130 | name := randstring(20) 131 | end := cfg.net.MakeEnd(name) 132 | cfg.net.Connect(name, servername) 133 | cfg.net.Enable(name, true) 134 | return end 135 | }) 136 | cfg.clerks[ck] = endnames 137 | cfg.nextClientId++ 138 | return ck 139 | } 140 | 141 | func (cfg *config) deleteClient(ck *Clerk) { 142 | cfg.mu.Lock() 143 | defer cfg.mu.Unlock() 144 | 145 | v := cfg.clerks[ck] 146 | for i := 0; i < len(v); i++ { 147 | os.Remove(v[i]) 148 | } 149 | delete(cfg.clerks, ck) 150 | } 151 | 152 | // Shutdown i'th server of gi'th group, by isolating it 153 | func (cfg *config) ShutdownServer(gi int, i int) { 154 | cfg.mu.Lock() 155 | defer cfg.mu.Unlock() 156 | 157 | gg := cfg.groups[gi] 158 | 159 | // prevent this server from sending 160 | for j := 0; j < len(gg.servers); j++ { 161 | name := gg.endnames[i][j] 162 | cfg.net.Enable(name, false) 163 | } 164 | for j := 0; j < len(gg.mendnames[i]); j++ { 165 | name := gg.mendnames[i][j] 166 | cfg.net.Enable(name, false) 167 | } 168 | 169 | // disable client connections to the server. 170 | // it's important to do this before creating 171 | // the new Persister in saved[i], to avoid 172 | // the possibility of the server returning a 173 | // positive reply to an Append but persisting 174 | // the result in the superseded Persister. 175 | cfg.net.DeleteServer(cfg.servername(gg.gid, i)) 176 | 177 | // a fresh persister, in case old instance 178 | // continues to update the Persister. 179 | // but copy old persister's content so that we always 180 | // pass Make() the last persisted state. 181 | if gg.saved[i] != nil { 182 | gg.saved[i] = gg.saved[i].Copy() 183 | } 184 | 185 | kv := gg.servers[i] 186 | if kv != nil { 187 | cfg.mu.Unlock() 188 | kv.Kill() 189 | cfg.mu.Lock() 190 | gg.servers[i] = nil 191 | } 192 | } 193 | 194 | func (cfg *config) ShutdownGroup(gi int) { 195 | for i := 0; i < cfg.n; i++ { 196 | cfg.ShutdownServer(gi, i) 197 | } 198 | } 199 | 200 | // start i'th server in gi'th group 201 | func (cfg *config) StartServer(gi int, i int) { 202 | cfg.mu.Lock() 203 | 204 | gg := cfg.groups[gi] 205 | 206 | // a fresh set of outgoing ClientEnd names 207 | // to talk to other servers in this group. 208 | gg.endnames[i] = make([]string, cfg.n) 209 | for j := 0; j < cfg.n; j++ { 210 | gg.endnames[i][j] = randstring(20) 211 | } 212 | 213 | // and the connections to other servers in this group. 214 | ends := make([]*labrpc.ClientEnd, cfg.n) 215 | for j := 0; j < cfg.n; j++ { 216 | ends[j] = cfg.net.MakeEnd(gg.endnames[i][j]) 217 | cfg.net.Connect(gg.endnames[i][j], cfg.servername(gg.gid, j)) 218 | cfg.net.Enable(gg.endnames[i][j], true) 219 | } 220 | 221 | // ends to talk to shardmaster service 222 | mends := make([]*labrpc.ClientEnd, cfg.nmasters) 223 | gg.mendnames[i] = make([]string, cfg.nmasters) 224 | for j := 0; j < cfg.nmasters; j++ { 225 | gg.mendnames[i][j] = randstring(20) 226 | mends[j] = cfg.net.MakeEnd(gg.mendnames[i][j]) 227 | cfg.net.Connect(gg.mendnames[i][j], cfg.mastername(j)) 228 | cfg.net.Enable(gg.mendnames[i][j], true) 229 | } 230 | 231 | // a fresh persister, so old instance doesn't overwrite 232 | // new instance's persisted state. 
233 | // give the fresh persister a copy of the old persister's 234 | // state, so that the spec is that we pass StartKVServer() 235 | // the last persisted state. 236 | if gg.saved[i] != nil { 237 | gg.saved[i] = gg.saved[i].Copy() 238 | } else { 239 | gg.saved[i] = raft.MakePersister() 240 | } 241 | cfg.mu.Unlock() 242 | 243 | gg.servers[i] = StartServer(ends, i, gg.saved[i], cfg.maxraftstate, 244 | gg.gid, mends, 245 | func(servername string) *labrpc.ClientEnd { 246 | name := randstring(20) 247 | end := cfg.net.MakeEnd(name) 248 | cfg.net.Connect(name, servername) 249 | cfg.net.Enable(name, true) 250 | return end 251 | }) 252 | 253 | kvsvc := labrpc.MakeService(gg.servers[i]) 254 | rfsvc := labrpc.MakeService(gg.servers[i].rf) 255 | srv := labrpc.MakeServer() 256 | srv.AddService(kvsvc) 257 | srv.AddService(rfsvc) 258 | cfg.net.AddServer(cfg.servername(gg.gid, i), srv) 259 | } 260 | 261 | func (cfg *config) StartGroup(gi int) { 262 | for i := 0; i < cfg.n; i++ { 263 | cfg.StartServer(gi, i) 264 | } 265 | } 266 | 267 | func (cfg *config) StartMasterServer(i int) { 268 | // ClientEnds to talk to other master replicas. 269 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 270 | for j := 0; j < cfg.nmasters; j++ { 271 | endname := randstring(20) 272 | ends[j] = cfg.net.MakeEnd(endname) 273 | cfg.net.Connect(endname, cfg.mastername(j)) 274 | cfg.net.Enable(endname, true) 275 | } 276 | 277 | p := raft.MakePersister() 278 | 279 | cfg.masterservers[i] = shardmaster.StartServer(ends, i, p) 280 | 281 | msvc := labrpc.MakeService(cfg.masterservers[i]) 282 | rfsvc := labrpc.MakeService(cfg.masterservers[i].Raft()) 283 | srv := labrpc.MakeServer() 284 | srv.AddService(msvc) 285 | srv.AddService(rfsvc) 286 | cfg.net.AddServer(cfg.mastername(i), srv) 287 | } 288 | 289 | func (cfg *config) shardclerk() *shardmaster.Clerk { 290 | // ClientEnds to talk to master service. 291 | ends := make([]*labrpc.ClientEnd, cfg.nmasters) 292 | for j := 0; j < cfg.nmasters; j++ { 293 | name := randstring(20) 294 | ends[j] = cfg.net.MakeEnd(name) 295 | cfg.net.Connect(name, cfg.mastername(j)) 296 | cfg.net.Enable(name, true) 297 | } 298 | 299 | return shardmaster.MakeClerk(ends) 300 | } 301 | 302 | // tell the shardmaster that a group is joining. 303 | func (cfg *config) join(gi int) { 304 | cfg.joinm([]int{gi}) 305 | } 306 | 307 | func (cfg *config) joinm(gis []int) { 308 | m := make(map[int][]string, len(gis)) 309 | for _, g := range gis { 310 | gid := cfg.groups[g].gid 311 | servernames := make([]string, cfg.n) 312 | for i := 0; i < cfg.n; i++ { 313 | servernames[i] = cfg.servername(gid, i) 314 | } 315 | m[gid] = servernames 316 | } 317 | cfg.mck.Join(m) 318 | } 319 | 320 | // tell the shardmaster that a group is leaving. 
321 | func (cfg *config) leave(gi int) { 322 | cfg.leavem([]int{gi}) 323 | } 324 | 325 | func (cfg *config) leavem(gis []int) { 326 | gids := make([]int, 0, len(gis)) 327 | for _, g := range gis { 328 | gids = append(gids, cfg.groups[g].gid) 329 | } 330 | cfg.mck.Leave(gids) 331 | } 332 | 333 | var ncpu_once sync.Once 334 | 335 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 336 | ncpu_once.Do(func() { 337 | if runtime.NumCPU() < 2 { 338 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 339 | } 340 | rand.Seed(makeSeed()) 341 | }) 342 | runtime.GOMAXPROCS(4) 343 | cfg := &config{} 344 | cfg.t = t 345 | cfg.maxraftstate = maxraftstate 346 | cfg.net = labrpc.MakeNetwork() 347 | cfg.start = time.Now() 348 | 349 | // master 350 | cfg.nmasters = 3 351 | cfg.masterservers = make([]*shardmaster.ShardMaster, cfg.nmasters) 352 | for i := 0; i < cfg.nmasters; i++ { 353 | cfg.StartMasterServer(i) 354 | } 355 | cfg.mck = cfg.shardclerk() 356 | 357 | cfg.ngroups = 3 358 | cfg.groups = make([]*group, cfg.ngroups) 359 | cfg.n = n 360 | for gi := 0; gi < cfg.ngroups; gi++ { 361 | gg := &group{} 362 | cfg.groups[gi] = gg 363 | gg.gid = 100 + gi 364 | gg.servers = make([]*ShardKV, cfg.n) 365 | gg.saved = make([]*raft.Persister, cfg.n) 366 | gg.endnames = make([][]string, cfg.n) 367 | gg.mendnames = make([][]string, cfg.nmasters) 368 | for i := 0; i < cfg.n; i++ { 369 | cfg.StartServer(gi, i) 370 | } 371 | } 372 | 373 | cfg.clerks = make(map[*Clerk][]string) 374 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 375 | 376 | cfg.net.Reliable(!unreliable) 377 | 378 | return cfg 379 | } 380 | -------------------------------------------------------------------------------- /src/shardkv/server.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // import "shardmaster" 4 | import "labrpc" 5 | import "raft" 6 | import "sync" 7 | import "labgob" 8 | 9 | type Op struct { 10 | // Your definitions here. 11 | // Field names must start with capital letters, 12 | // otherwise RPC will break. 13 | } 14 | 15 | type ShardKV struct { 16 | mu sync.Mutex 17 | me int 18 | rf *raft.Raft 19 | applyCh chan raft.ApplyMsg 20 | make_end func(string) *labrpc.ClientEnd 21 | gid int 22 | masters []*labrpc.ClientEnd 23 | maxraftstate int // snapshot if log grows this big 24 | 25 | // Your definitions here. 26 | } 27 | 28 | func (kv *ShardKV) Get(args *GetArgs, reply *GetReply) { 29 | // Your code here. 30 | } 31 | 32 | func (kv *ShardKV) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 33 | // Your code here. 34 | } 35 | 36 | // 37 | // the tester calls Kill() when a ShardKV instance won't 38 | // be needed again. you are not required to do anything 39 | // in Kill(), but it might be convenient to (for example) 40 | // turn off debug output from this instance. 41 | // 42 | func (kv *ShardKV) Kill() { 43 | kv.rf.Kill() 44 | // Your code here, if desired. 45 | } 46 | 47 | // 48 | // servers[] contains the ports of the servers in this group. 49 | // 50 | // me is the index of the current server in servers[]. 51 | // 52 | // the k/v server should store snapshots through the underlying Raft 53 | // implementation, which should call persister.SaveStateAndSnapshot() to 54 | // atomically save the Raft state along with the snapshot. 
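// in practice the server watches persister.RaftStateSize() after each
// applied command and, once it crosses the threshold described just below,
// encodes its own tables and hands them to Raft; a sketch (keeping a
// persister reference inside ShardKV is an addition, not part of this
// skeleton):
//
// if kv.maxraftstate != -1 && kv.persister.RaftStateSize() > kv.maxraftstate {
// 	// encode the k/v state with labgob, then have Raft trim its log and
// 	// call persister.SaveStateAndSnapshot() with the result.
// }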
55 | // 56 | // the k/v server should snapshot when Raft's saved state exceeds 57 | // maxraftstate bytes, in order to allow Raft to garbage-collect its 58 | // log. if maxraftstate is -1, you don't need to snapshot. 59 | // 60 | // gid is this group's GID, for interacting with the shardmaster. 61 | // 62 | // pass masters[] to shardmaster.MakeClerk() so you can send 63 | // RPCs to the shardmaster. 64 | // 65 | // make_end(servername) turns a server name from a 66 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 67 | // send RPCs. You'll need this to send RPCs to other groups. 68 | // 69 | // look at client.go for examples of how to use masters[] 70 | // and make_end() to send RPCs to the group owning a specific shard. 71 | // 72 | // StartServer() must return quickly, so it should start goroutines 73 | // for any long-running work. 74 | // 75 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, gid int, masters []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *ShardKV { 76 | // call labgob.Register on structures you want 77 | // Go's RPC library to marshall/unmarshall. 78 | labgob.Register(Op{}) 79 | 80 | kv := new(ShardKV) 81 | kv.me = me 82 | kv.maxraftstate = maxraftstate 83 | kv.make_end = make_end 84 | kv.gid = gid 85 | kv.masters = masters 86 | 87 | // Your initialization code here. 88 | 89 | // Use something like this to talk to the shardmaster: 90 | // kv.mck = shardmaster.MakeClerk(kv.masters) 91 | 92 | kv.applyCh = make(chan raft.ApplyMsg) 93 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 94 | 95 | return kv 96 | } 97 | -------------------------------------------------------------------------------- /src/shardkv/test_test.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "linearizability" 4 | 5 | import "testing" 6 | import "strconv" 7 | import "time" 8 | import "fmt" 9 | import "sync/atomic" 10 | import "sync" 11 | import "math/rand" 12 | 13 | const linearizabilityCheckTimeout = 1 * time.Second 14 | 15 | func check(t *testing.T, ck *Clerk, key string, value string) { 16 | v := ck.Get(key) 17 | if v != value { 18 | t.Fatalf("Get(%v): expected:\n%v\nreceived:\n%v", key, value, v) 19 | } 20 | } 21 | 22 | // 23 | // test static 2-way sharding, without shard movement. 24 | // 25 | func TestStaticShards(t *testing.T) { 26 | fmt.Printf("Test: static shards ...\n") 27 | 28 | cfg := make_config(t, 3, false, -1) 29 | defer cfg.cleanup() 30 | 31 | ck := cfg.makeClient() 32 | 33 | cfg.join(0) 34 | cfg.join(1) 35 | 36 | n := 10 37 | ka := make([]string, n) 38 | va := make([]string, n) 39 | for i := 0; i < n; i++ { 40 | ka[i] = strconv.Itoa(i) // ensure multiple shards 41 | va[i] = randstring(20) 42 | ck.Put(ka[i], va[i]) 43 | } 44 | for i := 0; i < n; i++ { 45 | check(t, ck, ka[i], va[i]) 46 | } 47 | 48 | // make sure that the data really is sharded by 49 | // shutting down one shard and checking that some 50 | // Get()s don't succeed. 51 | cfg.ShutdownGroup(1) 52 | cfg.checklogs() // forbid snapshots 53 | 54 | ch := make(chan bool) 55 | for xi := 0; xi < n; xi++ { 56 | ck1 := cfg.makeClient() // only one call allowed per client 57 | go func(i int) { 58 | defer func() { ch <- true }() 59 | check(t, ck1, ka[i], va[i]) 60 | }(xi) 61 | } 62 | 63 | // wait a bit, only about half the Gets should succeed. 
64 | ndone := 0 65 | done := false 66 | for done == false { 67 | select { 68 | case <-ch: 69 | ndone += 1 70 | case <-time.After(time.Second * 2): 71 | done = true 72 | break 73 | } 74 | } 75 | 76 | if ndone != 5 { 77 | t.Fatalf("expected 5 completions with one shard dead; got %v\n", ndone) 78 | } 79 | 80 | // bring the crashed shard/group back to life. 81 | cfg.StartGroup(1) 82 | for i := 0; i < n; i++ { 83 | check(t, ck, ka[i], va[i]) 84 | } 85 | 86 | fmt.Printf(" ... Passed\n") 87 | } 88 | 89 | func TestJoinLeave(t *testing.T) { 90 | fmt.Printf("Test: join then leave ...\n") 91 | 92 | cfg := make_config(t, 3, false, -1) 93 | defer cfg.cleanup() 94 | 95 | ck := cfg.makeClient() 96 | 97 | cfg.join(0) 98 | 99 | n := 10 100 | ka := make([]string, n) 101 | va := make([]string, n) 102 | for i := 0; i < n; i++ { 103 | ka[i] = strconv.Itoa(i) // ensure multiple shards 104 | va[i] = randstring(5) 105 | ck.Put(ka[i], va[i]) 106 | } 107 | for i := 0; i < n; i++ { 108 | check(t, ck, ka[i], va[i]) 109 | } 110 | 111 | cfg.join(1) 112 | 113 | for i := 0; i < n; i++ { 114 | check(t, ck, ka[i], va[i]) 115 | x := randstring(5) 116 | ck.Append(ka[i], x) 117 | va[i] += x 118 | } 119 | 120 | cfg.leave(0) 121 | 122 | for i := 0; i < n; i++ { 123 | check(t, ck, ka[i], va[i]) 124 | x := randstring(5) 125 | ck.Append(ka[i], x) 126 | va[i] += x 127 | } 128 | 129 | // allow time for shards to transfer. 130 | time.Sleep(1 * time.Second) 131 | 132 | cfg.checklogs() 133 | cfg.ShutdownGroup(0) 134 | 135 | for i := 0; i < n; i++ { 136 | check(t, ck, ka[i], va[i]) 137 | } 138 | 139 | fmt.Printf(" ... Passed\n") 140 | } 141 | 142 | func TestSnapshot(t *testing.T) { 143 | fmt.Printf("Test: snapshots, join, and leave ...\n") 144 | 145 | cfg := make_config(t, 3, false, 1000) 146 | defer cfg.cleanup() 147 | 148 | ck := cfg.makeClient() 149 | 150 | cfg.join(0) 151 | 152 | n := 30 153 | ka := make([]string, n) 154 | va := make([]string, n) 155 | for i := 0; i < n; i++ { 156 | ka[i] = strconv.Itoa(i) // ensure multiple shards 157 | va[i] = randstring(20) 158 | ck.Put(ka[i], va[i]) 159 | } 160 | for i := 0; i < n; i++ { 161 | check(t, ck, ka[i], va[i]) 162 | } 163 | 164 | cfg.join(1) 165 | cfg.join(2) 166 | cfg.leave(0) 167 | 168 | for i := 0; i < n; i++ { 169 | check(t, ck, ka[i], va[i]) 170 | x := randstring(20) 171 | ck.Append(ka[i], x) 172 | va[i] += x 173 | } 174 | 175 | cfg.leave(1) 176 | cfg.join(0) 177 | 178 | for i := 0; i < n; i++ { 179 | check(t, ck, ka[i], va[i]) 180 | x := randstring(20) 181 | ck.Append(ka[i], x) 182 | va[i] += x 183 | } 184 | 185 | time.Sleep(1 * time.Second) 186 | 187 | for i := 0; i < n; i++ { 188 | check(t, ck, ka[i], va[i]) 189 | } 190 | 191 | time.Sleep(1 * time.Second) 192 | 193 | cfg.checklogs() 194 | 195 | cfg.ShutdownGroup(0) 196 | cfg.ShutdownGroup(1) 197 | cfg.ShutdownGroup(2) 198 | 199 | cfg.StartGroup(0) 200 | cfg.StartGroup(1) 201 | cfg.StartGroup(2) 202 | 203 | for i := 0; i < n; i++ { 204 | check(t, ck, ka[i], va[i]) 205 | } 206 | 207 | fmt.Printf(" ... 
Passed\n") 208 | } 209 | 210 | func TestMissChange(t *testing.T) { 211 | fmt.Printf("Test: servers miss configuration changes...\n") 212 | 213 | cfg := make_config(t, 3, false, 1000) 214 | defer cfg.cleanup() 215 | 216 | ck := cfg.makeClient() 217 | 218 | cfg.join(0) 219 | 220 | n := 10 221 | ka := make([]string, n) 222 | va := make([]string, n) 223 | for i := 0; i < n; i++ { 224 | ka[i] = strconv.Itoa(i) // ensure multiple shards 225 | va[i] = randstring(20) 226 | ck.Put(ka[i], va[i]) 227 | } 228 | for i := 0; i < n; i++ { 229 | check(t, ck, ka[i], va[i]) 230 | } 231 | 232 | cfg.join(1) 233 | 234 | cfg.ShutdownServer(0, 0) 235 | cfg.ShutdownServer(1, 0) 236 | cfg.ShutdownServer(2, 0) 237 | 238 | cfg.join(2) 239 | cfg.leave(1) 240 | cfg.leave(0) 241 | 242 | for i := 0; i < n; i++ { 243 | check(t, ck, ka[i], va[i]) 244 | x := randstring(20) 245 | ck.Append(ka[i], x) 246 | va[i] += x 247 | } 248 | 249 | cfg.join(1) 250 | 251 | for i := 0; i < n; i++ { 252 | check(t, ck, ka[i], va[i]) 253 | x := randstring(20) 254 | ck.Append(ka[i], x) 255 | va[i] += x 256 | } 257 | 258 | cfg.StartServer(0, 0) 259 | cfg.StartServer(1, 0) 260 | cfg.StartServer(2, 0) 261 | 262 | for i := 0; i < n; i++ { 263 | check(t, ck, ka[i], va[i]) 264 | x := randstring(20) 265 | ck.Append(ka[i], x) 266 | va[i] += x 267 | } 268 | 269 | time.Sleep(2 * time.Second) 270 | 271 | cfg.ShutdownServer(0, 1) 272 | cfg.ShutdownServer(1, 1) 273 | cfg.ShutdownServer(2, 1) 274 | 275 | cfg.join(0) 276 | cfg.leave(2) 277 | 278 | for i := 0; i < n; i++ { 279 | check(t, ck, ka[i], va[i]) 280 | x := randstring(20) 281 | ck.Append(ka[i], x) 282 | va[i] += x 283 | } 284 | 285 | cfg.StartServer(0, 1) 286 | cfg.StartServer(1, 1) 287 | cfg.StartServer(2, 1) 288 | 289 | for i := 0; i < n; i++ { 290 | check(t, ck, ka[i], va[i]) 291 | } 292 | 293 | fmt.Printf(" ... 
Passed\n") 294 | } 295 | 296 | func TestConcurrent1(t *testing.T) { 297 | fmt.Printf("Test: concurrent puts and configuration changes...\n") 298 | 299 | cfg := make_config(t, 3, false, 100) 300 | defer cfg.cleanup() 301 | 302 | ck := cfg.makeClient() 303 | 304 | cfg.join(0) 305 | 306 | n := 10 307 | ka := make([]string, n) 308 | va := make([]string, n) 309 | for i := 0; i < n; i++ { 310 | ka[i] = strconv.Itoa(i) // ensure multiple shards 311 | va[i] = randstring(5) 312 | ck.Put(ka[i], va[i]) 313 | } 314 | 315 | var done int32 316 | ch := make(chan bool) 317 | 318 | ff := func(i int) { 319 | defer func() { ch <- true }() 320 | ck1 := cfg.makeClient() 321 | for atomic.LoadInt32(&done) == 0 { 322 | x := randstring(5) 323 | ck1.Append(ka[i], x) 324 | va[i] += x 325 | time.Sleep(10 * time.Millisecond) 326 | } 327 | } 328 | 329 | for i := 0; i < n; i++ { 330 | go ff(i) 331 | } 332 | 333 | time.Sleep(150 * time.Millisecond) 334 | cfg.join(1) 335 | time.Sleep(500 * time.Millisecond) 336 | cfg.join(2) 337 | time.Sleep(500 * time.Millisecond) 338 | cfg.leave(0) 339 | 340 | cfg.ShutdownGroup(0) 341 | time.Sleep(100 * time.Millisecond) 342 | cfg.ShutdownGroup(1) 343 | time.Sleep(100 * time.Millisecond) 344 | cfg.ShutdownGroup(2) 345 | 346 | cfg.leave(2) 347 | 348 | time.Sleep(100 * time.Millisecond) 349 | cfg.StartGroup(0) 350 | cfg.StartGroup(1) 351 | cfg.StartGroup(2) 352 | 353 | time.Sleep(100 * time.Millisecond) 354 | cfg.join(0) 355 | cfg.leave(1) 356 | time.Sleep(500 * time.Millisecond) 357 | cfg.join(1) 358 | 359 | time.Sleep(1 * time.Second) 360 | 361 | atomic.StoreInt32(&done, 1) 362 | for i := 0; i < n; i++ { 363 | <-ch 364 | } 365 | 366 | for i := 0; i < n; i++ { 367 | check(t, ck, ka[i], va[i]) 368 | } 369 | 370 | fmt.Printf(" ... Passed\n") 371 | } 372 | 373 | // 374 | // this tests the various sources from which a re-starting 375 | // group might need to fetch shard contents. 376 | // 377 | func TestConcurrent2(t *testing.T) { 378 | fmt.Printf("Test: more concurrent puts and configuration changes...\n") 379 | 380 | cfg := make_config(t, 3, false, -1) 381 | defer cfg.cleanup() 382 | 383 | ck := cfg.makeClient() 384 | 385 | cfg.join(1) 386 | cfg.join(0) 387 | cfg.join(2) 388 | 389 | n := 10 390 | ka := make([]string, n) 391 | va := make([]string, n) 392 | for i := 0; i < n; i++ { 393 | ka[i] = strconv.Itoa(i) // ensure multiple shards 394 | va[i] = randstring(1) 395 | ck.Put(ka[i], va[i]) 396 | } 397 | 398 | var done int32 399 | ch := make(chan bool) 400 | 401 | ff := func(i int, ck1 *Clerk) { 402 | defer func() { ch <- true }() 403 | for atomic.LoadInt32(&done) == 0 { 404 | x := randstring(1) 405 | ck1.Append(ka[i], x) 406 | va[i] += x 407 | time.Sleep(50 * time.Millisecond) 408 | } 409 | } 410 | 411 | for i := 0; i < n; i++ { 412 | ck1 := cfg.makeClient() 413 | go ff(i, ck1) 414 | } 415 | 416 | cfg.leave(0) 417 | cfg.leave(2) 418 | time.Sleep(3000 * time.Millisecond) 419 | cfg.join(0) 420 | cfg.join(2) 421 | cfg.leave(1) 422 | time.Sleep(3000 * time.Millisecond) 423 | cfg.join(1) 424 | cfg.leave(0) 425 | cfg.leave(2) 426 | time.Sleep(3000 * time.Millisecond) 427 | 428 | cfg.ShutdownGroup(1) 429 | cfg.ShutdownGroup(2) 430 | time.Sleep(1000 * time.Millisecond) 431 | cfg.StartGroup(1) 432 | cfg.StartGroup(2) 433 | 434 | time.Sleep(2 * time.Second) 435 | 436 | atomic.StoreInt32(&done, 1) 437 | for i := 0; i < n; i++ { 438 | <-ch 439 | } 440 | 441 | for i := 0; i < n; i++ { 442 | check(t, ck, ka[i], va[i]) 443 | } 444 | 445 | fmt.Printf(" ... 
Passed\n") 446 | } 447 | 448 | func TestUnreliable1(t *testing.T) { 449 | fmt.Printf("Test: unreliable 1...\n") 450 | 451 | cfg := make_config(t, 3, true, 100) 452 | defer cfg.cleanup() 453 | 454 | ck := cfg.makeClient() 455 | 456 | cfg.join(0) 457 | 458 | n := 10 459 | ka := make([]string, n) 460 | va := make([]string, n) 461 | for i := 0; i < n; i++ { 462 | ka[i] = strconv.Itoa(i) // ensure multiple shards 463 | va[i] = randstring(5) 464 | ck.Put(ka[i], va[i]) 465 | } 466 | 467 | cfg.join(1) 468 | cfg.join(2) 469 | cfg.leave(0) 470 | 471 | for ii := 0; ii < n*2; ii++ { 472 | i := ii % n 473 | check(t, ck, ka[i], va[i]) 474 | x := randstring(5) 475 | ck.Append(ka[i], x) 476 | va[i] += x 477 | } 478 | 479 | cfg.join(0) 480 | cfg.leave(1) 481 | 482 | for ii := 0; ii < n*2; ii++ { 483 | i := ii % n 484 | check(t, ck, ka[i], va[i]) 485 | } 486 | 487 | fmt.Printf(" ... Passed\n") 488 | } 489 | 490 | func TestUnreliable2(t *testing.T) { 491 | fmt.Printf("Test: unreliable 2...\n") 492 | 493 | cfg := make_config(t, 3, true, 100) 494 | defer cfg.cleanup() 495 | 496 | ck := cfg.makeClient() 497 | 498 | cfg.join(0) 499 | 500 | n := 10 501 | ka := make([]string, n) 502 | va := make([]string, n) 503 | for i := 0; i < n; i++ { 504 | ka[i] = strconv.Itoa(i) // ensure multiple shards 505 | va[i] = randstring(5) 506 | ck.Put(ka[i], va[i]) 507 | } 508 | 509 | var done int32 510 | ch := make(chan bool) 511 | 512 | ff := func(i int) { 513 | defer func() { ch <- true }() 514 | ck1 := cfg.makeClient() 515 | for atomic.LoadInt32(&done) == 0 { 516 | x := randstring(5) 517 | ck1.Append(ka[i], x) 518 | va[i] += x 519 | } 520 | } 521 | 522 | for i := 0; i < n; i++ { 523 | go ff(i) 524 | } 525 | 526 | time.Sleep(150 * time.Millisecond) 527 | cfg.join(1) 528 | time.Sleep(500 * time.Millisecond) 529 | cfg.join(2) 530 | time.Sleep(500 * time.Millisecond) 531 | cfg.leave(0) 532 | time.Sleep(500 * time.Millisecond) 533 | cfg.leave(1) 534 | time.Sleep(500 * time.Millisecond) 535 | cfg.join(1) 536 | cfg.join(0) 537 | 538 | time.Sleep(2 * time.Second) 539 | 540 | atomic.StoreInt32(&done, 1) 541 | cfg.net.Reliable(true) 542 | for i := 0; i < n; i++ { 543 | <-ch 544 | } 545 | 546 | for i := 0; i < n; i++ { 547 | check(t, ck, ka[i], va[i]) 548 | } 549 | 550 | fmt.Printf(" ... 
Passed\n") 551 | } 552 | 553 | func TestUnreliable3(t *testing.T) { 554 | fmt.Printf("Test: unreliable 3...\n") 555 | 556 | cfg := make_config(t, 3, true, 100) 557 | defer cfg.cleanup() 558 | 559 | begin := time.Now() 560 | var operations []linearizability.Operation 561 | var opMu sync.Mutex 562 | 563 | ck := cfg.makeClient() 564 | 565 | cfg.join(0) 566 | 567 | n := 10 568 | ka := make([]string, n) 569 | va := make([]string, n) 570 | for i := 0; i < n; i++ { 571 | ka[i] = strconv.Itoa(i) // ensure multiple shards 572 | va[i] = randstring(5) 573 | start := int64(time.Since(begin)) 574 | ck.Put(ka[i], va[i]) 575 | end := int64(time.Since(begin)) 576 | inp := linearizability.KvInput{Op: 1, Key: ka[i], Value: va[i]} 577 | var out linearizability.KvOutput 578 | op := linearizability.Operation{Input: inp, Call: start, Output: out, Return: end} 579 | operations = append(operations, op) 580 | } 581 | 582 | var done int32 583 | ch := make(chan bool) 584 | 585 | ff := func(i int) { 586 | defer func() { ch <- true }() 587 | ck1 := cfg.makeClient() 588 | for atomic.LoadInt32(&done) == 0 { 589 | ki := rand.Int() % n 590 | nv := randstring(5) 591 | var inp linearizability.KvInput 592 | var out linearizability.KvOutput 593 | start := int64(time.Since(begin)) 594 | if (rand.Int() % 1000) < 500 { 595 | ck1.Append(ka[ki], nv) 596 | inp = linearizability.KvInput{Op: 2, Key: ka[ki], Value: nv} 597 | } else if (rand.Int() % 1000) < 100 { 598 | ck1.Put(ka[ki], nv) 599 | inp = linearizability.KvInput{Op: 1, Key: ka[ki], Value: nv} 600 | } else { 601 | v := ck1.Get(ka[ki]) 602 | inp = linearizability.KvInput{Op: 0, Key: ka[ki]} 603 | out = linearizability.KvOutput{Value: v} 604 | } 605 | end := int64(time.Since(begin)) 606 | op := linearizability.Operation{Input: inp, Call: start, Output: out, Return: end} 607 | opMu.Lock() 608 | operations = append(operations, op) 609 | opMu.Unlock() 610 | } 611 | } 612 | 613 | for i := 0; i < n; i++ { 614 | go ff(i) 615 | } 616 | 617 | time.Sleep(150 * time.Millisecond) 618 | cfg.join(1) 619 | time.Sleep(500 * time.Millisecond) 620 | cfg.join(2) 621 | time.Sleep(500 * time.Millisecond) 622 | cfg.leave(0) 623 | time.Sleep(500 * time.Millisecond) 624 | cfg.leave(1) 625 | time.Sleep(500 * time.Millisecond) 626 | cfg.join(1) 627 | cfg.join(0) 628 | 629 | time.Sleep(2 * time.Second) 630 | 631 | atomic.StoreInt32(&done, 1) 632 | cfg.net.Reliable(true) 633 | for i := 0; i < n; i++ { 634 | <-ch 635 | } 636 | 637 | // log.Printf("Checking linearizability of %d operations", len(operations)) 638 | // start := time.Now() 639 | ok := linearizability.CheckOperationsTimeout(linearizability.KvModel(), operations, linearizabilityCheckTimeout) 640 | // dur := time.Since(start) 641 | // log.Printf("Linearizability check done in %s; result: %t", time.Since(start).String(), ok) 642 | if !ok { 643 | t.Fatal("history is not linearizable") 644 | } 645 | 646 | fmt.Printf(" ... Passed\n") 647 | } 648 | 649 | // 650 | // optional test to see whether servers are deleting 651 | // shards for which they are no longer responsible. 652 | // 653 | func TestChallenge1Delete(t *testing.T) { 654 | fmt.Printf("Test: shard deletion (challenge 1) ...\n") 655 | 656 | // "1" means force snapshot after every log entry. 657 | cfg := make_config(t, 3, false, 1) 658 | defer cfg.cleanup() 659 | 660 | ck := cfg.makeClient() 661 | 662 | cfg.join(0) 663 | 664 | // 30,000 bytes of total values. 
665 | n := 30 666 | ka := make([]string, n) 667 | va := make([]string, n) 668 | for i := 0; i < n; i++ { 669 | ka[i] = strconv.Itoa(i) 670 | va[i] = randstring(1000) 671 | ck.Put(ka[i], va[i]) 672 | } 673 | for i := 0; i < 3; i++ { 674 | check(t, ck, ka[i], va[i]) 675 | } 676 | 677 | for iters := 0; iters < 2; iters++ { 678 | cfg.join(1) 679 | cfg.leave(0) 680 | cfg.join(2) 681 | time.Sleep(3 * time.Second) 682 | for i := 0; i < 3; i++ { 683 | check(t, ck, ka[i], va[i]) 684 | } 685 | cfg.leave(1) 686 | cfg.join(0) 687 | cfg.leave(2) 688 | time.Sleep(3 * time.Second) 689 | for i := 0; i < 3; i++ { 690 | check(t, ck, ka[i], va[i]) 691 | } 692 | } 693 | 694 | cfg.join(1) 695 | cfg.join(2) 696 | time.Sleep(1 * time.Second) 697 | for i := 0; i < 3; i++ { 698 | check(t, ck, ka[i], va[i]) 699 | } 700 | time.Sleep(1 * time.Second) 701 | for i := 0; i < 3; i++ { 702 | check(t, ck, ka[i], va[i]) 703 | } 704 | time.Sleep(1 * time.Second) 705 | for i := 0; i < 3; i++ { 706 | check(t, ck, ka[i], va[i]) 707 | } 708 | 709 | total := 0 710 | for gi := 0; gi < cfg.ngroups; gi++ { 711 | for i := 0; i < cfg.n; i++ { 712 | raft := cfg.groups[gi].saved[i].RaftStateSize() 713 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 714 | total += raft + snap 715 | } 716 | } 717 | 718 | // 27 keys should be stored once. 719 | // 3 keys should also be stored in client dup tables. 720 | // everything on 3 replicas. 721 | // plus slop. 722 | expected := 3 * (((n - 3) * 1000) + 2*3*1000 + 6000) 723 | if total > expected { 724 | t.Fatalf("snapshot + persisted Raft state are too big: %v > %v\n", total, expected) 725 | } 726 | 727 | for i := 0; i < n; i++ { 728 | check(t, ck, ka[i], va[i]) 729 | } 730 | 731 | fmt.Printf(" ... Passed\n") 732 | } 733 | 734 | func TestChallenge1Concurrent(t *testing.T) { 735 | fmt.Printf("Test: concurrent configuration change and restart (challenge 1)...\n") 736 | 737 | cfg := make_config(t, 3, false, 300) 738 | defer cfg.cleanup() 739 | 740 | ck := cfg.makeClient() 741 | 742 | cfg.join(0) 743 | 744 | n := 10 745 | ka := make([]string, n) 746 | va := make([]string, n) 747 | for i := 0; i < n; i++ { 748 | ka[i] = strconv.Itoa(i) 749 | va[i] = randstring(1) 750 | ck.Put(ka[i], va[i]) 751 | } 752 | 753 | var done int32 754 | ch := make(chan bool) 755 | 756 | ff := func(i int, ck1 *Clerk) { 757 | defer func() { ch <- true }() 758 | for atomic.LoadInt32(&done) == 0 { 759 | x := randstring(1) 760 | ck1.Append(ka[i], x) 761 | va[i] += x 762 | } 763 | } 764 | 765 | for i := 0; i < n; i++ { 766 | ck1 := cfg.makeClient() 767 | go ff(i, ck1) 768 | } 769 | 770 | t0 := time.Now() 771 | for time.Since(t0) < 12*time.Second { 772 | cfg.join(2) 773 | cfg.join(1) 774 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 775 | cfg.ShutdownGroup(0) 776 | cfg.ShutdownGroup(1) 777 | cfg.ShutdownGroup(2) 778 | cfg.StartGroup(0) 779 | cfg.StartGroup(1) 780 | cfg.StartGroup(2) 781 | 782 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 783 | cfg.leave(1) 784 | cfg.leave(2) 785 | time.Sleep(time.Duration(rand.Int()%900) * time.Millisecond) 786 | } 787 | 788 | time.Sleep(2 * time.Second) 789 | 790 | atomic.StoreInt32(&done, 1) 791 | for i := 0; i < n; i++ { 792 | <-ch 793 | } 794 | 795 | for i := 0; i < n; i++ { 796 | check(t, ck, ka[i], va[i]) 797 | } 798 | 799 | fmt.Printf(" ... 
Passed\n") 800 | } 801 | 802 | // 803 | // optional test to see whether servers can handle 804 | // shards that are not affected by a config change 805 | // while the config change is underway 806 | // 807 | func TestChallenge2Unaffected(t *testing.T) { 808 | fmt.Printf("Test: unaffected shard access (challenge 2) ...\n") 809 | 810 | cfg := make_config(t, 3, true, 100) 811 | defer cfg.cleanup() 812 | 813 | ck := cfg.makeClient() 814 | 815 | // JOIN 100 816 | cfg.join(0) 817 | 818 | // Do a bunch of puts to keys in all shards 819 | n := 10 820 | ka := make([]string, n) 821 | va := make([]string, n) 822 | for i := 0; i < n; i++ { 823 | ka[i] = strconv.Itoa(i) // ensure multiple shards 824 | va[i] = "100" 825 | ck.Put(ka[i], va[i]) 826 | } 827 | 828 | // JOIN 101 829 | cfg.join(1) 830 | 831 | // QUERY to find shards now owned by 101 832 | c := cfg.mck.Query(-1) 833 | owned := make(map[int]bool, n) 834 | for s, gid := range c.Shards { 835 | owned[s] = gid == cfg.groups[1].gid 836 | } 837 | 838 | // Wait for migration to new config to complete, and for clients to 839 | // start using this updated config. Gets to any key k such that 840 | // owned[shard(k)] == true should now be served by group 101. 841 | <-time.After(1 * time.Second) 842 | for i := 0; i < n; i++ { 843 | if owned[i] { 844 | va[i] = "101" 845 | ck.Put(ka[i], va[i]) 846 | } 847 | } 848 | 849 | // KILL 100 850 | cfg.ShutdownGroup(0) 851 | 852 | // LEAVE 100 853 | // 101 doesn't get a chance to migrate things previously owned by 100 854 | cfg.leave(0) 855 | 856 | // Wait to make sure clients see new config 857 | <-time.After(1 * time.Second) 858 | 859 | // And finally: check that gets/puts for 101-owned keys still complete 860 | for i := 0; i < n; i++ { 861 | shard := int(ka[i][0]) % 10 862 | if owned[shard] { 863 | check(t, ck, ka[i], va[i]) 864 | ck.Put(ka[i], va[i]+"-1") 865 | check(t, ck, ka[i], va[i]+"-1") 866 | } 867 | } 868 | 869 | fmt.Printf(" ... Passed\n") 870 | } 871 | 872 | // 873 | // optional test to see whether servers can handle operations on shards that 874 | // have been received as a part of a config migration when the entire migration 875 | // has not yet completed. 876 | // 877 | func TestChallenge2Partial(t *testing.T) { 878 | fmt.Printf("Test: partial migration shard access (challenge 2) ...\n") 879 | 880 | cfg := make_config(t, 3, true, 100) 881 | defer cfg.cleanup() 882 | 883 | ck := cfg.makeClient() 884 | 885 | // JOIN 100 + 101 + 102 886 | cfg.joinm([]int{0, 1, 2}) 887 | 888 | // Give the implementation some time to reconfigure 889 | <-time.After(1 * time.Second) 890 | 891 | // Do a bunch of puts to keys in all shards 892 | n := 10 893 | ka := make([]string, n) 894 | va := make([]string, n) 895 | for i := 0; i < n; i++ { 896 | ka[i] = strconv.Itoa(i) // ensure multiple shards 897 | va[i] = "100" 898 | ck.Put(ka[i], va[i]) 899 | } 900 | 901 | // QUERY to find shards owned by 102 902 | c := cfg.mck.Query(-1) 903 | owned := make(map[int]bool, n) 904 | for s, gid := range c.Shards { 905 | owned[s] = gid == cfg.groups[2].gid 906 | } 907 | 908 | // KILL 100 909 | cfg.ShutdownGroup(0) 910 | 911 | // LEAVE 100 + 102 912 | // 101 can get old shards from 102, but not from 100. 
101 should start 913 | // serving shards that used to belong to 102 as soon as possible 914 | cfg.leavem([]int{0, 2}) 915 | 916 | // Give the implementation some time to start reconfiguration 917 | // And to migrate 102 -> 101 918 | <-time.After(1 * time.Second) 919 | 920 | // And finally: check that gets/puts for 101-owned keys now complete 921 | for i := 0; i < n; i++ { 922 | shard := key2shard(ka[i]) 923 | if owned[shard] { 924 | check(t, ck, ka[i], va[i]) 925 | ck.Put(ka[i], va[i]+"-2") 926 | check(t, ck, ka[i], va[i]+"-2") 927 | } 928 | } 929 | 930 | fmt.Printf(" ... Passed\n") 931 | } 932 | -------------------------------------------------------------------------------- /src/shardmaster/client.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | // 4 | // Shardmaster clerk. 5 | // 6 | 7 | import "labrpc" 8 | import "time" 9 | import "crypto/rand" 10 | import "math/big" 11 | 12 | type Clerk struct { 13 | servers []*labrpc.ClientEnd 14 | // Your data here. 15 | } 16 | 17 | func nrand() int64 { 18 | max := big.NewInt(int64(1) << 62) 19 | bigx, _ := rand.Int(rand.Reader, max) 20 | x := bigx.Int64() 21 | return x 22 | } 23 | 24 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 25 | ck := new(Clerk) 26 | ck.servers = servers 27 | // Your code here. 28 | return ck 29 | } 30 | 31 | func (ck *Clerk) Query(num int) Config { 32 | args := &QueryArgs{} 33 | // Your code here. 34 | args.Num = num 35 | for { 36 | // try each known server. 37 | for _, srv := range ck.servers { 38 | var reply QueryReply 39 | ok := srv.Call("ShardMaster.Query", args, &reply) 40 | if ok && reply.WrongLeader == false { 41 | return reply.Config 42 | } 43 | } 44 | time.Sleep(100 * time.Millisecond) 45 | } 46 | } 47 | 48 | func (ck *Clerk) Join(servers map[int][]string) { 49 | args := &JoinArgs{} 50 | // Your code here. 51 | args.Servers = servers 52 | 53 | for { 54 | // try each known server. 55 | for _, srv := range ck.servers { 56 | var reply JoinReply 57 | ok := srv.Call("ShardMaster.Join", args, &reply) 58 | if ok && reply.WrongLeader == false { 59 | return 60 | } 61 | } 62 | time.Sleep(100 * time.Millisecond) 63 | } 64 | } 65 | 66 | func (ck *Clerk) Leave(gids []int) { 67 | args := &LeaveArgs{} 68 | // Your code here. 69 | args.GIDs = gids 70 | 71 | for { 72 | // try each known server. 73 | for _, srv := range ck.servers { 74 | var reply LeaveReply 75 | ok := srv.Call("ShardMaster.Leave", args, &reply) 76 | if ok && reply.WrongLeader == false { 77 | return 78 | } 79 | } 80 | time.Sleep(100 * time.Millisecond) 81 | } 82 | } 83 | 84 | func (ck *Clerk) Move(shard int, gid int) { 85 | args := &MoveArgs{} 86 | // Your code here. 87 | args.Shard = shard 88 | args.GID = gid 89 | 90 | for { 91 | // try each known server. 92 | for _, srv := range ck.servers { 93 | var reply MoveReply 94 | ok := srv.Call("ShardMaster.Move", args, &reply) 95 | if ok && reply.WrongLeader == false { 96 | return 97 | } 98 | } 99 | time.Sleep(100 * time.Millisecond) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/shardmaster/common.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | // 4 | // Master shard server: assigns shards to replication groups. 5 | // 6 | // RPC interface: 7 | // Join(servers) -- add a set of groups (gid -> server-list mapping). 8 | // Leave(gids) -- delete a set of groups. 
9 | // Move(shard, gid) -- hand off one shard from current owner to gid. 10 | // Query(num) -> fetch Config # num, or latest config if num==-1. 11 | // 12 | // A Config (configuration) describes a set of replica groups, and the 13 | // replica group responsible for each shard. Configs are numbered. Config 14 | // #0 is the initial configuration, with no groups and all shards 15 | // assigned to group 0 (the invalid group). 16 | // 17 | // You will need to add fields to the RPC argument structs. 18 | // 19 | 20 | // The number of shards. 21 | const NShards = 10 22 | 23 | // A configuration -- an assignment of shards to groups. 24 | // Please don't change this. 25 | type Config struct { 26 | Num int // config number 27 | Shards [NShards]int // shard -> gid 28 | Groups map[int][]string // gid -> servers[] 29 | } 30 | 31 | const ( 32 | OK = "OK" 33 | ) 34 | 35 | type Err string 36 | 37 | type JoinArgs struct { 38 | Servers map[int][]string // new GID -> servers mappings 39 | } 40 | 41 | type JoinReply struct { 42 | WrongLeader bool 43 | Err Err 44 | } 45 | 46 | type LeaveArgs struct { 47 | GIDs []int 48 | } 49 | 50 | type LeaveReply struct { 51 | WrongLeader bool 52 | Err Err 53 | } 54 | 55 | type MoveArgs struct { 56 | Shard int 57 | GID int 58 | } 59 | 60 | type MoveReply struct { 61 | WrongLeader bool 62 | Err Err 63 | } 64 | 65 | type QueryArgs struct { 66 | Num int // desired config number 67 | } 68 | 69 | type QueryReply struct { 70 | WrongLeader bool 71 | Err Err 72 | Config Config 73 | } 74 | -------------------------------------------------------------------------------- /src/shardmaster/config.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import "labrpc" 4 | import "raft" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "time" 15 | 16 | func randstring(n int) string { 17 | b := make([]byte, 2*n) 18 | crand.Read(b) 19 | s := base64.URLEncoding.EncodeToString(b) 20 | return s[0:n] 21 | } 22 | 23 | // Randomize server handles 24 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 25 | sa := make([]*labrpc.ClientEnd, len(kvh)) 26 | copy(sa, kvh) 27 | for i := range sa { 28 | j := rand.Intn(i + 1) 29 | sa[i], sa[j] = sa[j], sa[i] 30 | } 31 | return sa 32 | } 33 | 34 | type config struct { 35 | mu sync.Mutex 36 | t *testing.T 37 | net *labrpc.Network 38 | n int 39 | servers []*ShardMaster 40 | saved []*raft.Persister 41 | endnames [][]string // names of each server's sending ClientEnds 42 | clerks map[*Clerk][]string 43 | nextClientId int 44 | start time.Time // time at which make_config() was called 45 | } 46 | 47 | func (cfg *config) checkTimeout() { 48 | // enforce a two minute real-time limit on each test 49 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 50 | cfg.t.Fatal("test took longer than 120 seconds") 51 | } 52 | } 53 | 54 | func (cfg *config) cleanup() { 55 | cfg.mu.Lock() 56 | defer cfg.mu.Unlock() 57 | for i := 0; i < len(cfg.servers); i++ { 58 | if cfg.servers[i] != nil { 59 | cfg.servers[i].Kill() 60 | } 61 | } 62 | cfg.net.Cleanup() 63 | cfg.checkTimeout() 64 | } 65 | 66 | // Maximum log size across all servers 67 | func (cfg *config) LogSize() int { 68 | logsize := 0 69 | for i := 0; i < cfg.n; i++ { 70 | n := cfg.saved[i].RaftStateSize() 71 | if n > logsize { 72 | logsize = n 73 | } 74 | } 75 | return logsize 76 | } 77 
| 78 | // attach server i to servers listed in to 79 | // caller must hold cfg.mu 80 | func (cfg *config) connectUnlocked(i int, to []int) { 81 | // log.Printf("connect peer %d to %v\n", i, to) 82 | 83 | // outgoing socket files 84 | for j := 0; j < len(to); j++ { 85 | endname := cfg.endnames[i][to[j]] 86 | cfg.net.Enable(endname, true) 87 | } 88 | 89 | // incoming socket files 90 | for j := 0; j < len(to); j++ { 91 | endname := cfg.endnames[to[j]][i] 92 | cfg.net.Enable(endname, true) 93 | } 94 | } 95 | 96 | func (cfg *config) connect(i int, to []int) { 97 | cfg.mu.Lock() 98 | defer cfg.mu.Unlock() 99 | cfg.connectUnlocked(i, to) 100 | } 101 | 102 | // detach server i from the servers listed in from 103 | // caller must hold cfg.mu 104 | func (cfg *config) disconnectUnlocked(i int, from []int) { 105 | // log.Printf("disconnect peer %d from %v\n", i, from) 106 | 107 | // outgoing socket files 108 | for j := 0; j < len(from); j++ { 109 | if cfg.endnames[i] != nil { 110 | endname := cfg.endnames[i][from[j]] 111 | cfg.net.Enable(endname, false) 112 | } 113 | } 114 | 115 | // incoming socket files 116 | for j := 0; j < len(from); j++ { 117 | if cfg.endnames[j] != nil { 118 | endname := cfg.endnames[from[j]][i] 119 | cfg.net.Enable(endname, false) 120 | } 121 | } 122 | } 123 | 124 | func (cfg *config) disconnect(i int, from []int) { 125 | cfg.mu.Lock() 126 | defer cfg.mu.Unlock() 127 | cfg.disconnectUnlocked(i, from) 128 | } 129 | 130 | func (cfg *config) All() []int { 131 | all := make([]int, cfg.n) 132 | for i := 0; i < cfg.n; i++ { 133 | all[i] = i 134 | } 135 | return all 136 | } 137 | 138 | func (cfg *config) ConnectAll() { 139 | cfg.mu.Lock() 140 | defer cfg.mu.Unlock() 141 | for i := 0; i < cfg.n; i++ { 142 | cfg.connectUnlocked(i, cfg.All()) 143 | } 144 | } 145 | 146 | // Sets up 2 partitions with connectivity between servers in each partition. 147 | func (cfg *config) partition(p1 []int, p2 []int) { 148 | cfg.mu.Lock() 149 | defer cfg.mu.Unlock() 150 | // log.Printf("partition servers into: %v %v\n", p1, p2) 151 | for i := 0; i < len(p1); i++ { 152 | cfg.disconnectUnlocked(p1[i], p2) 153 | cfg.connectUnlocked(p1[i], p1) 154 | } 155 | for i := 0; i < len(p2); i++ { 156 | cfg.disconnectUnlocked(p2[i], p1) 157 | cfg.connectUnlocked(p2[i], p2) 158 | } 159 | } 160 | 161 | // Create a clerk with clerk specific server names. 162 | // Give it connections to all of the servers, but for 163 | // now enable only connections to servers in to[]. 164 | func (cfg *config) makeClient(to []int) *Clerk { 165 | cfg.mu.Lock() 166 | defer cfg.mu.Unlock() 167 | 168 | // a fresh set of ClientEnds. 
169 | ends := make([]*labrpc.ClientEnd, cfg.n) 170 | endnames := make([]string, cfg.n) 171 | for j := 0; j < cfg.n; j++ { 172 | endnames[j] = randstring(20) 173 | ends[j] = cfg.net.MakeEnd(endnames[j]) 174 | cfg.net.Connect(endnames[j], j) 175 | } 176 | 177 | ck := MakeClerk(random_handles(ends)) 178 | cfg.clerks[ck] = endnames 179 | cfg.nextClientId++ 180 | cfg.ConnectClientUnlocked(ck, to) 181 | return ck 182 | } 183 | 184 | func (cfg *config) deleteClient(ck *Clerk) { 185 | cfg.mu.Lock() 186 | defer cfg.mu.Unlock() 187 | 188 | v := cfg.clerks[ck] 189 | for i := 0; i < len(v); i++ { 190 | os.Remove(v[i]) 191 | } 192 | delete(cfg.clerks, ck) 193 | } 194 | 195 | // caller should hold cfg.mu 196 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 197 | // log.Printf("ConnectClient %v to %v\n", ck, to) 198 | endnames := cfg.clerks[ck] 199 | for j := 0; j < len(to); j++ { 200 | s := endnames[to[j]] 201 | cfg.net.Enable(s, true) 202 | } 203 | } 204 | 205 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 206 | cfg.mu.Lock() 207 | defer cfg.mu.Unlock() 208 | cfg.ConnectClientUnlocked(ck, to) 209 | } 210 | 211 | // caller should hold cfg.mu 212 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 213 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 214 | endnames := cfg.clerks[ck] 215 | for j := 0; j < len(from); j++ { 216 | s := endnames[from[j]] 217 | cfg.net.Enable(s, false) 218 | } 219 | } 220 | 221 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 222 | cfg.mu.Lock() 223 | defer cfg.mu.Unlock() 224 | cfg.DisconnectClientUnlocked(ck, from) 225 | } 226 | 227 | // Shutdown a server by isolating it 228 | func (cfg *config) ShutdownServer(i int) { 229 | cfg.mu.Lock() 230 | defer cfg.mu.Unlock() 231 | 232 | cfg.disconnectUnlocked(i, cfg.All()) 233 | 234 | // disable client connections to the server. 235 | // it's important to do this before creating 236 | // the new Persister in saved[i], to avoid 237 | // the possibility of the server returning a 238 | // positive reply to an Append but persisting 239 | // the result in the superseded Persister. 240 | cfg.net.DeleteServer(i) 241 | 242 | // a fresh persister, in case old instance 243 | // continues to update the Persister. 244 | // but copy old persister's content so that we always 245 | // pass Make() the last persisted state. 246 | if cfg.saved[i] != nil { 247 | cfg.saved[i] = cfg.saved[i].Copy() 248 | } 249 | 250 | kv := cfg.servers[i] 251 | if kv != nil { 252 | cfg.mu.Unlock() 253 | kv.Kill() 254 | cfg.mu.Lock() 255 | cfg.servers[i] = nil 256 | } 257 | } 258 | 259 | // If restart servers, first call ShutdownServer 260 | func (cfg *config) StartServer(i int) { 261 | cfg.mu.Lock() 262 | 263 | // a fresh set of outgoing ClientEnd names. 264 | cfg.endnames[i] = make([]string, cfg.n) 265 | for j := 0; j < cfg.n; j++ { 266 | cfg.endnames[i][j] = randstring(20) 267 | } 268 | 269 | // a fresh set of ClientEnds. 270 | ends := make([]*labrpc.ClientEnd, cfg.n) 271 | for j := 0; j < cfg.n; j++ { 272 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 273 | cfg.net.Connect(cfg.endnames[i][j], j) 274 | } 275 | 276 | // a fresh persister, so old instance doesn't overwrite 277 | // new instance's persisted state. 278 | // give the fresh persister a copy of the old persister's 279 | // state, so that the spec is that we pass StartKVServer() 280 | // the last persisted state. 
281 | if cfg.saved[i] != nil { 282 | cfg.saved[i] = cfg.saved[i].Copy() 283 | } else { 284 | cfg.saved[i] = raft.MakePersister() 285 | } 286 | 287 | cfg.mu.Unlock() 288 | 289 | cfg.servers[i] = StartServer(ends, i, cfg.saved[i]) 290 | 291 | kvsvc := labrpc.MakeService(cfg.servers[i]) 292 | rfsvc := labrpc.MakeService(cfg.servers[i].rf) 293 | srv := labrpc.MakeServer() 294 | srv.AddService(kvsvc) 295 | srv.AddService(rfsvc) 296 | cfg.net.AddServer(i, srv) 297 | } 298 | 299 | func (cfg *config) Leader() (bool, int) { 300 | cfg.mu.Lock() 301 | defer cfg.mu.Unlock() 302 | 303 | for i := 0; i < cfg.n; i++ { 304 | _, is_leader := cfg.servers[i].rf.GetState() 305 | if is_leader { 306 | return true, i 307 | } 308 | } 309 | return false, 0 310 | } 311 | 312 | // Partition servers into 2 groups and put current leader in minority 313 | func (cfg *config) make_partition() ([]int, []int) { 314 | _, l := cfg.Leader() 315 | p1 := make([]int, cfg.n/2+1) 316 | p2 := make([]int, cfg.n/2) 317 | j := 0 318 | for i := 0; i < cfg.n; i++ { 319 | if i != l { 320 | if j < len(p1) { 321 | p1[j] = i 322 | } else { 323 | p2[j-len(p1)] = i 324 | } 325 | j++ 326 | } 327 | } 328 | p2[len(p2)-1] = l 329 | return p1, p2 330 | } 331 | 332 | func make_config(t *testing.T, n int, unreliable bool) *config { 333 | runtime.GOMAXPROCS(4) 334 | cfg := &config{} 335 | cfg.t = t 336 | cfg.net = labrpc.MakeNetwork() 337 | cfg.n = n 338 | cfg.servers = make([]*ShardMaster, cfg.n) 339 | cfg.saved = make([]*raft.Persister, cfg.n) 340 | cfg.endnames = make([][]string, cfg.n) 341 | cfg.clerks = make(map[*Clerk][]string) 342 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 343 | cfg.start = time.Now() 344 | 345 | // create a full set of KV servers. 346 | for i := 0; i < cfg.n; i++ { 347 | cfg.StartServer(i) 348 | } 349 | 350 | cfg.ConnectAll() 351 | 352 | cfg.net.Reliable(!unreliable) 353 | 354 | return cfg 355 | } 356 | -------------------------------------------------------------------------------- /src/shardmaster/server.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import "raft" 4 | import "labrpc" 5 | import "sync" 6 | import "labgob" 7 | 8 | type ShardMaster struct { 9 | mu sync.Mutex 10 | me int 11 | rf *raft.Raft 12 | applyCh chan raft.ApplyMsg 13 | 14 | // Your data here. 15 | 16 | configs []Config // indexed by config num 17 | } 18 | 19 | type Op struct { 20 | // Your data here. 21 | } 22 | 23 | func (sm *ShardMaster) Join(args *JoinArgs, reply *JoinReply) { 24 | // Your code here. 25 | } 26 | 27 | func (sm *ShardMaster) Leave(args *LeaveArgs, reply *LeaveReply) { 28 | // Your code here. 29 | } 30 | 31 | func (sm *ShardMaster) Move(args *MoveArgs, reply *MoveReply) { 32 | // Your code here. 33 | } 34 | 35 | func (sm *ShardMaster) Query(args *QueryArgs, reply *QueryReply) { 36 | // Your code here. 37 | } 38 | 39 | // 40 | // the tester calls Kill() when a ShardMaster instance won't 41 | // be needed again. you are not required to do anything 42 | // in Kill(), but it might be convenient to (for example) 43 | // turn off debug output from this instance. 44 | // 45 | func (sm *ShardMaster) Kill() { 46 | sm.rf.Kill() 47 | // Your code here, if desired. 
48 | } 49 | 50 | // needed by shardkv tester 51 | func (sm *ShardMaster) Raft() *raft.Raft { 52 | return sm.rf 53 | } 54 | 55 | // 56 | // servers[] contains the ports of the set of 57 | // servers that will cooperate via Paxos to 58 | // form the fault-tolerant shardmaster service. 59 | // me is the index of the current server in servers[]. 60 | // 61 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister) *ShardMaster { 62 | sm := new(ShardMaster) 63 | sm.me = me 64 | 65 | sm.configs = make([]Config, 1) 66 | sm.configs[0].Groups = map[int][]string{} 67 | 68 | labgob.Register(Op{}) 69 | sm.applyCh = make(chan raft.ApplyMsg) 70 | sm.rf = raft.Make(servers, me, persister, sm.applyCh) 71 | 72 | // Your code here. 73 | 74 | return sm 75 | } 76 | -------------------------------------------------------------------------------- /src/shardmaster/test_test.go: -------------------------------------------------------------------------------- 1 | package shardmaster 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | ) 7 | 8 | // import "time" 9 | import "fmt" 10 | 11 | func check(t *testing.T, groups []int, ck *Clerk) { 12 | c := ck.Query(-1) 13 | if len(c.Groups) != len(groups) { 14 | t.Fatalf("wanted %v groups, got %v", len(groups), len(c.Groups)) 15 | } 16 | 17 | // are the groups as expected? 18 | for _, g := range groups { 19 | _, ok := c.Groups[g] 20 | if ok != true { 21 | t.Fatalf("missing group %v", g) 22 | } 23 | } 24 | 25 | // any un-allocated shards? 26 | if len(groups) > 0 { 27 | for s, g := range c.Shards { 28 | _, ok := c.Groups[g] 29 | if ok == false { 30 | t.Fatalf("shard %v -> invalid group %v", s, g) 31 | } 32 | } 33 | } 34 | 35 | // more or less balanced sharding? 36 | counts := map[int]int{} 37 | for _, g := range c.Shards { 38 | counts[g] += 1 39 | } 40 | min := 257 41 | max := 0 42 | for g, _ := range c.Groups { 43 | if counts[g] > max { 44 | max = counts[g] 45 | } 46 | if counts[g] < min { 47 | min = counts[g] 48 | } 49 | } 50 | if max > min+1 { 51 | t.Fatalf("max %v too much larger than min %v", max, min) 52 | } 53 | } 54 | 55 | func check_same_config(t *testing.T, c1 Config, c2 Config) { 56 | if c1.Num != c2.Num { 57 | t.Fatalf("Num wrong") 58 | } 59 | if c1.Shards != c2.Shards { 60 | t.Fatalf("Shards wrong") 61 | } 62 | if len(c1.Groups) != len(c2.Groups) { 63 | t.Fatalf("number of Groups is wrong") 64 | } 65 | for gid, sa := range c1.Groups { 66 | sa1, ok := c2.Groups[gid] 67 | if ok == false || len(sa1) != len(sa) { 68 | t.Fatalf("len(Groups) wrong") 69 | } 70 | if ok && len(sa1) == len(sa) { 71 | for j := 0; j < len(sa); j++ { 72 | if sa[j] != sa1[j] { 73 | t.Fatalf("Groups wrong") 74 | } 75 | } 76 | } 77 | } 78 | } 79 | 80 | func TestBasic(t *testing.T) { 81 | const nservers = 3 82 | cfg := make_config(t, nservers, false) 83 | defer cfg.cleanup() 84 | 85 | ck := cfg.makeClient(cfg.All()) 86 | 87 | fmt.Printf("Test: Basic leave/join ...\n") 88 | 89 | cfa := make([]Config, 6) 90 | cfa[0] = ck.Query(-1) 91 | 92 | check(t, []int{}, ck) 93 | 94 | var gid1 int = 1 95 | ck.Join(map[int][]string{gid1: []string{"x", "y", "z"}}) 96 | check(t, []int{gid1}, ck) 97 | cfa[1] = ck.Query(-1) 98 | 99 | var gid2 int = 2 100 | ck.Join(map[int][]string{gid2: []string{"a", "b", "c"}}) 101 | check(t, []int{gid1, gid2}, ck) 102 | cfa[2] = ck.Query(-1) 103 | 104 | cfx := ck.Query(-1) 105 | sa1 := cfx.Groups[gid1] 106 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 107 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 108 | } 109 | sa2 := 
cfx.Groups[gid2] 110 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 111 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 112 | } 113 | 114 | ck.Leave([]int{gid1}) 115 | check(t, []int{gid2}, ck) 116 | cfa[4] = ck.Query(-1) 117 | 118 | ck.Leave([]int{gid2}) 119 | cfa[5] = ck.Query(-1) 120 | 121 | fmt.Printf(" ... Passed\n") 122 | 123 | fmt.Printf("Test: Historical queries ...\n") 124 | 125 | for s := 0; s < nservers; s++ { 126 | cfg.ShutdownServer(s) 127 | for i := 0; i < len(cfa); i++ { 128 | c := ck.Query(cfa[i].Num) 129 | check_same_config(t, c, cfa[i]) 130 | } 131 | cfg.StartServer(s) 132 | cfg.ConnectAll() 133 | } 134 | 135 | fmt.Printf(" ... Passed\n") 136 | 137 | fmt.Printf("Test: Move ...\n") 138 | { 139 | var gid3 int = 503 140 | ck.Join(map[int][]string{gid3: []string{"3a", "3b", "3c"}}) 141 | var gid4 int = 504 142 | ck.Join(map[int][]string{gid4: []string{"4a", "4b", "4c"}}) 143 | for i := 0; i < NShards; i++ { 144 | cf := ck.Query(-1) 145 | if i < NShards/2 { 146 | ck.Move(i, gid3) 147 | if cf.Shards[i] != gid3 { 148 | cf1 := ck.Query(-1) 149 | if cf1.Num <= cf.Num { 150 | t.Fatalf("Move should increase Config.Num") 151 | } 152 | } 153 | } else { 154 | ck.Move(i, gid4) 155 | if cf.Shards[i] != gid4 { 156 | cf1 := ck.Query(-1) 157 | if cf1.Num <= cf.Num { 158 | t.Fatalf("Move should increase Config.Num") 159 | } 160 | } 161 | } 162 | } 163 | cf2 := ck.Query(-1) 164 | for i := 0; i < NShards; i++ { 165 | if i < NShards/2 { 166 | if cf2.Shards[i] != gid3 { 167 | t.Fatalf("expected shard %v on gid %v actually %v", 168 | i, gid3, cf2.Shards[i]) 169 | } 170 | } else { 171 | if cf2.Shards[i] != gid4 { 172 | t.Fatalf("expected shard %v on gid %v actually %v", 173 | i, gid4, cf2.Shards[i]) 174 | } 175 | } 176 | } 177 | ck.Leave([]int{gid3}) 178 | ck.Leave([]int{gid4}) 179 | } 180 | fmt.Printf(" ... Passed\n") 181 | 182 | fmt.Printf("Test: Concurrent leave/join ...\n") 183 | 184 | const npara = 10 185 | var cka [npara]*Clerk 186 | for i := 0; i < len(cka); i++ { 187 | cka[i] = cfg.makeClient(cfg.All()) 188 | } 189 | gids := make([]int, npara) 190 | ch := make(chan bool) 191 | for xi := 0; xi < npara; xi++ { 192 | gids[xi] = int((xi * 10) + 100) 193 | go func(i int) { 194 | defer func() { ch <- true }() 195 | var gid int = gids[i] 196 | var sid1 = fmt.Sprintf("s%da", gid) 197 | var sid2 = fmt.Sprintf("s%db", gid) 198 | cka[i].Join(map[int][]string{gid + 1000: []string{sid1}}) 199 | cka[i].Join(map[int][]string{gid: []string{sid2}}) 200 | cka[i].Leave([]int{gid + 1000}) 201 | }(xi) 202 | } 203 | for i := 0; i < npara; i++ { 204 | <-ch 205 | } 206 | check(t, gids, ck) 207 | 208 | fmt.Printf(" ... Passed\n") 209 | 210 | fmt.Printf("Test: Minimal transfers after joins ...\n") 211 | 212 | c1 := ck.Query(-1) 213 | for i := 0; i < 5; i++ { 214 | var gid = int(npara + 1 + i) 215 | ck.Join(map[int][]string{gid: []string{ 216 | fmt.Sprintf("%da", gid), 217 | fmt.Sprintf("%db", gid), 218 | fmt.Sprintf("%db", gid)}}) 219 | } 220 | c2 := ck.Query(-1) 221 | for i := int(1); i <= npara; i++ { 222 | for j := 0; j < len(c1.Shards); j++ { 223 | if c2.Shards[j] == i { 224 | if c1.Shards[j] != i { 225 | t.Fatalf("non-minimal transfer after Join()s") 226 | } 227 | } 228 | } 229 | } 230 | 231 | fmt.Printf(" ... 
Passed\n") 232 | 233 | fmt.Printf("Test: Minimal transfers after leaves ...\n") 234 | 235 | for i := 0; i < 5; i++ { 236 | ck.Leave([]int{int(npara + 1 + i)}) 237 | } 238 | c3 := ck.Query(-1) 239 | for i := int(1); i <= npara; i++ { 240 | for j := 0; j < len(c1.Shards); j++ { 241 | if c2.Shards[j] == i { 242 | if c3.Shards[j] != i { 243 | t.Fatalf("non-minimal transfer after Leave()s") 244 | } 245 | } 246 | } 247 | } 248 | 249 | fmt.Printf(" ... Passed\n") 250 | } 251 | 252 | func TestMulti(t *testing.T) { 253 | const nservers = 3 254 | cfg := make_config(t, nservers, false) 255 | defer cfg.cleanup() 256 | 257 | ck := cfg.makeClient(cfg.All()) 258 | 259 | fmt.Printf("Test: Multi-group join/leave ...\n") 260 | 261 | cfa := make([]Config, 6) 262 | cfa[0] = ck.Query(-1) 263 | 264 | check(t, []int{}, ck) 265 | 266 | var gid1 int = 1 267 | var gid2 int = 2 268 | ck.Join(map[int][]string{ 269 | gid1: []string{"x", "y", "z"}, 270 | gid2: []string{"a", "b", "c"}, 271 | }) 272 | check(t, []int{gid1, gid2}, ck) 273 | cfa[1] = ck.Query(-1) 274 | 275 | var gid3 int = 3 276 | ck.Join(map[int][]string{gid3: []string{"j", "k", "l"}}) 277 | check(t, []int{gid1, gid2, gid3}, ck) 278 | cfa[2] = ck.Query(-1) 279 | 280 | cfx := ck.Query(-1) 281 | sa1 := cfx.Groups[gid1] 282 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 283 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 284 | } 285 | sa2 := cfx.Groups[gid2] 286 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 287 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 288 | } 289 | sa3 := cfx.Groups[gid3] 290 | if len(sa3) != 3 || sa3[0] != "j" || sa3[1] != "k" || sa3[2] != "l" { 291 | t.Fatalf("wrong servers for gid %v: %v\n", gid3, sa3) 292 | } 293 | 294 | ck.Leave([]int{gid1, gid3}) 295 | check(t, []int{gid2}, ck) 296 | cfa[3] = ck.Query(-1) 297 | 298 | cfx = ck.Query(-1) 299 | sa2 = cfx.Groups[gid2] 300 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 301 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 302 | } 303 | 304 | ck.Leave([]int{gid2}) 305 | 306 | fmt.Printf(" ... Passed\n") 307 | 308 | fmt.Printf("Test: Concurrent multi leave/join ...\n") 309 | 310 | const npara = 10 311 | var cka [npara]*Clerk 312 | for i := 0; i < len(cka); i++ { 313 | cka[i] = cfg.makeClient(cfg.All()) 314 | } 315 | gids := make([]int, npara) 316 | var wg sync.WaitGroup 317 | for xi := 0; xi < npara; xi++ { 318 | wg.Add(1) 319 | gids[xi] = int(xi + 1000) 320 | go func(i int) { 321 | defer wg.Done() 322 | var gid int = gids[i] 323 | cka[i].Join(map[int][]string{ 324 | gid: []string{ 325 | fmt.Sprintf("%da", gid), 326 | fmt.Sprintf("%db", gid), 327 | fmt.Sprintf("%dc", gid)}, 328 | gid + 1000: []string{fmt.Sprintf("%da", gid+1000)}, 329 | gid + 2000: []string{fmt.Sprintf("%da", gid+2000)}, 330 | }) 331 | cka[i].Leave([]int{gid + 1000, gid + 2000}) 332 | }(xi) 333 | } 334 | wg.Wait() 335 | check(t, gids, ck) 336 | 337 | fmt.Printf(" ... 
Passed\n") 338 | 339 | fmt.Printf("Test: Minimal transfers after multijoins ...\n") 340 | 341 | c1 := ck.Query(-1) 342 | m := make(map[int][]string) 343 | for i := 0; i < 5; i++ { 344 | var gid = npara + 1 + i 345 | m[gid] = []string{fmt.Sprintf("%da", gid), fmt.Sprintf("%db", gid)} 346 | } 347 | ck.Join(m) 348 | c2 := ck.Query(-1) 349 | for i := int(1); i <= npara; i++ { 350 | for j := 0; j < len(c1.Shards); j++ { 351 | if c2.Shards[j] == i { 352 | if c1.Shards[j] != i { 353 | t.Fatalf("non-minimal transfer after Join()s") 354 | } 355 | } 356 | } 357 | } 358 | 359 | fmt.Printf(" ... Passed\n") 360 | 361 | fmt.Printf("Test: Minimal transfers after multileaves ...\n") 362 | 363 | var l []int 364 | for i := 0; i < 5; i++ { 365 | l = append(l, npara+1+i) 366 | } 367 | ck.Leave(l) 368 | c3 := ck.Query(-1) 369 | for i := int(1); i <= npara; i++ { 370 | for j := 0; j < len(c1.Shards); j++ { 371 | if c2.Shards[j] == i { 372 | if c3.Shards[j] != i { 373 | t.Fatalf("non-minimal transfer after Leave()s") 374 | } 375 | } 376 | } 377 | } 378 | 379 | fmt.Printf(" ... Passed\n") 380 | } 381 | --------------------------------------------------------------------------------